polars-df 0.3.1-x86_64-darwin → 0.5.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -1
- data/Cargo.lock +486 -380
- data/Cargo.toml +0 -2
- data/LICENSE-THIRD-PARTY.txt +6761 -7881
- data/README.md +31 -2
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +263 -87
- data/lib/polars/data_types.rb +6 -4
- data/lib/polars/date_time_expr.rb +148 -8
- data/lib/polars/expr.rb +78 -11
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +107 -10
- data/lib/polars/lazy_functions.rb +7 -3
- data/lib/polars/list_expr.rb +70 -21
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +190 -74
- data/lib/polars/string_expr.rb +150 -44
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +51 -9
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +4 -2
- metadata +4 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -18,7 +18,10 @@ module Polars
|
|
18
18
|
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
19
19
|
# the orientation is inferred by matching the columns and data dimensions. If
|
20
20
|
# this does not yield conclusive results, column orientation is used.
|
21
|
-
def initialize(data = nil, columns: nil, orient: nil)
|
21
|
+
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
22
|
+
schema ||= columns
|
23
|
+
raise Todo if schema_overrides
|
24
|
+
|
22
25
|
# TODO deprecate in favor of read_sql
|
23
26
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
24
27
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
@@ -29,14 +32,14 @@ module Polars
|
|
29
32
|
end
|
30
33
|
|
31
34
|
if data.nil?
|
32
|
-
self._df = self.class.hash_to_rbdf({},
|
35
|
+
self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
33
36
|
elsif data.is_a?(Hash)
|
34
37
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
35
|
-
self._df = self.class.hash_to_rbdf(data,
|
38
|
+
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
36
39
|
elsif data.is_a?(Array)
|
37
|
-
self._df = self.class.sequence_to_rbdf(data,
|
40
|
+
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
38
41
|
elsif data.is_a?(Series)
|
39
|
-
self._df = self.class.series_to_rbdf(data,
|
42
|
+
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
40
43
|
else
|
41
44
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
42
45
|
end
|
@@ -56,8 +59,8 @@ module Polars
|
|
56
59
|
end
|
57
60
|
|
58
61
|
# @private
|
59
|
-
def self._from_hash(data,
|
60
|
-
_from_rbdf(hash_to_rbdf(data,
|
62
|
+
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
63
|
+
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
61
64
|
end
|
62
65
|
|
63
66
|
# def self._from_records
|
@@ -97,7 +100,7 @@ module Polars
|
|
97
100
|
eol_char: "\n"
|
98
101
|
)
|
99
102
|
if Utils.pathlike?(file)
|
100
|
-
path = Utils.
|
103
|
+
path = Utils.normalise_filepath(file)
|
101
104
|
else
|
102
105
|
path = nil
|
103
106
|
# if defined?(StringIO) && file.is_a?(StringIO)
|
@@ -196,32 +199,56 @@ module Polars
|
|
196
199
|
|
197
200
|
# @private
|
198
201
|
def self._read_parquet(
|
199
|
-
|
202
|
+
source,
|
200
203
|
columns: nil,
|
201
204
|
n_rows: nil,
|
202
205
|
parallel: "auto",
|
203
206
|
row_count_name: nil,
|
204
207
|
row_count_offset: 0,
|
205
|
-
low_memory: false
|
208
|
+
low_memory: false,
|
209
|
+
use_statistics: true,
|
210
|
+
rechunk: true
|
206
211
|
)
|
207
|
-
if Utils.pathlike?(
|
208
|
-
|
212
|
+
if Utils.pathlike?(source)
|
213
|
+
source = Utils.normalise_filepath(source)
|
214
|
+
end
|
215
|
+
if columns.is_a?(String)
|
216
|
+
columns = [columns]
|
209
217
|
end
|
210
218
|
|
211
|
-
if
|
212
|
-
|
219
|
+
if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
|
220
|
+
scan =
|
221
|
+
Polars.scan_parquet(
|
222
|
+
source,
|
223
|
+
n_rows: n_rows,
|
224
|
+
rechunk: true,
|
225
|
+
parallel: parallel,
|
226
|
+
row_count_name: row_count_name,
|
227
|
+
row_count_offset: row_count_offset,
|
228
|
+
low_memory: low_memory
|
229
|
+
)
|
230
|
+
|
231
|
+
if columns.nil?
|
232
|
+
return self._from_rbdf(scan.collect._df)
|
233
|
+
elsif Utils.is_str_sequence(columns, allow_str: false)
|
234
|
+
return self._from_rbdf(scan.select(columns).collect._df)
|
235
|
+
else
|
236
|
+
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
237
|
+
end
|
213
238
|
end
|
214
239
|
|
215
240
|
projection, columns = Utils.handle_projection_columns(columns)
|
216
241
|
_from_rbdf(
|
217
242
|
RbDataFrame.read_parquet(
|
218
|
-
|
243
|
+
source,
|
219
244
|
columns,
|
220
245
|
projection,
|
221
246
|
n_rows,
|
222
247
|
parallel,
|
223
248
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
224
|
-
low_memory
|
249
|
+
low_memory,
|
250
|
+
use_statistics,
|
251
|
+
rechunk
|
225
252
|
)
|
226
253
|
)
|
227
254
|
end
|
@@ -229,7 +256,7 @@ module Polars
|
|
229
256
|
# @private
|
230
257
|
def self._read_avro(file, columns: nil, n_rows: nil)
|
231
258
|
if Utils.pathlike?(file)
|
232
|
-
file = Utils.
|
259
|
+
file = Utils.normalise_filepath(file)
|
233
260
|
end
|
234
261
|
projection, columns = Utils.handle_projection_columns(columns)
|
235
262
|
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
@@ -246,7 +273,7 @@ module Polars
|
|
246
273
|
memory_map: true
|
247
274
|
)
|
248
275
|
if Utils.pathlike?(file)
|
249
|
-
file = Utils.
|
276
|
+
file = Utils.normalise_filepath(file)
|
250
277
|
end
|
251
278
|
if columns.is_a?(String)
|
252
279
|
columns = [columns]
|
@@ -272,7 +299,7 @@ module Polars
|
|
272
299
|
# @private
|
273
300
|
def self._read_json(file)
|
274
301
|
if Utils.pathlike?(file)
|
275
|
-
file = Utils.
|
302
|
+
file = Utils.normalise_filepath(file)
|
276
303
|
end
|
277
304
|
|
278
305
|
_from_rbdf(RbDataFrame.read_json(file))
|
@@ -281,7 +308,7 @@ module Polars
|
|
281
308
|
# @private
|
282
309
|
def self._read_ndjson(file)
|
283
310
|
if Utils.pathlike?(file)
|
284
|
-
file = Utils.
|
311
|
+
file = Utils.normalise_filepath(file)
|
285
312
|
end
|
286
313
|
|
287
314
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
@@ -312,6 +339,7 @@ module Polars
|
|
312
339
|
end
|
313
340
|
alias_method :count, :height
|
314
341
|
alias_method :length, :height
|
342
|
+
alias_method :size, :height
|
315
343
|
|
316
344
|
# Get the width of the DataFrame.
|
317
345
|
#
|
@@ -522,6 +550,13 @@ module Polars
|
|
522
550
|
end
|
523
551
|
alias_method :inspect, :to_s
|
524
552
|
|
553
|
+
# Returns the rows of the DataFrame as an array (the result of `rows(named: true)`).
|
554
|
+
#
|
555
|
+
# @return [Array]
|
556
|
+
def to_a
|
557
|
+
rows(named: true)
|
558
|
+
end
|
559
|
+
|
525
560
|
# Check if DataFrame includes column.
|
526
561
|
#
|
527
562
|
# @return [Boolean]
|
@@ -631,7 +666,7 @@ module Polars
|
|
631
666
|
end
|
632
667
|
|
633
668
|
# Ruby-specific
|
634
|
-
if item.is_a?(Expr)
|
669
|
+
if item.is_a?(Expr) || item.is_a?(Series)
|
635
670
|
return filter(item)
|
636
671
|
end
|
637
672
|
|
@@ -641,15 +676,42 @@ module Polars
|
|
641
676
|
# Set item.
|
642
677
|
#
|
643
678
|
# @return [Object]
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
679
|
+
def []=(*key, value)
|
680
|
+
if key.length == 1
|
681
|
+
key = key.first
|
682
|
+
elsif key.length != 2
|
683
|
+
raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
|
684
|
+
end
|
649
685
|
|
650
|
-
|
651
|
-
|
686
|
+
if Utils.strlike?(key)
|
687
|
+
if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
688
|
+
value = Series.new(value)
|
689
|
+
elsif !value.is_a?(Series)
|
690
|
+
value = Polars.lit(value)
|
691
|
+
end
|
692
|
+
self._df = with_column(value.alias(key.to_s))._df
|
693
|
+
elsif key.is_a?(Array)
|
694
|
+
row_selection, col_selection = key
|
695
|
+
|
696
|
+
if Utils.strlike?(col_selection)
|
697
|
+
s = self[col_selection]
|
698
|
+
elsif col_selection.is_a?(Integer)
|
699
|
+
raise Todo
|
700
|
+
else
|
701
|
+
raise ArgumentError, "column selection not understood: #{col_selection}"
|
702
|
+
end
|
703
|
+
|
704
|
+
s[row_selection] = value
|
652
705
|
|
706
|
+
if col_selection.is_a?(Integer)
|
707
|
+
replace_at_idx(col_selection, s)
|
708
|
+
elsif Utils.strlike?(col_selection)
|
709
|
+
replace(col_selection, s)
|
710
|
+
end
|
711
|
+
else
|
712
|
+
raise Todo
|
713
|
+
end
|
714
|
+
end
|
653
715
|
|
654
716
|
# Return the dataframe as a scalar.
|
655
717
|
#
|
@@ -774,7 +836,7 @@ module Polars
|
|
774
836
|
row_oriented: false
|
775
837
|
)
|
776
838
|
if Utils.pathlike?(file)
|
777
|
-
file = Utils.
|
839
|
+
file = Utils.normalise_filepath(file)
|
778
840
|
end
|
779
841
|
|
780
842
|
_df.write_json(file, pretty, row_oriented)
|
@@ -789,7 +851,7 @@ module Polars
|
|
789
851
|
# @return [nil]
|
790
852
|
def write_ndjson(file)
|
791
853
|
if Utils.pathlike?(file)
|
792
|
-
file = Utils.
|
854
|
+
file = Utils.normalise_filepath(file)
|
793
855
|
end
|
794
856
|
|
795
857
|
_df.write_ndjson(file)
|
@@ -879,7 +941,7 @@ module Polars
|
|
879
941
|
end
|
880
942
|
|
881
943
|
if Utils.pathlike?(file)
|
882
|
-
file = Utils.
|
944
|
+
file = Utils.normalise_filepath(file)
|
883
945
|
end
|
884
946
|
|
885
947
|
_df.write_csv(
|
@@ -917,7 +979,7 @@ module Polars
|
|
917
979
|
compression = "uncompressed"
|
918
980
|
end
|
919
981
|
if Utils.pathlike?(file)
|
920
|
-
file = Utils.
|
982
|
+
file = Utils.normalise_filepath(file)
|
921
983
|
end
|
922
984
|
|
923
985
|
_df.write_avro(file, compression)
|
@@ -936,7 +998,7 @@ module Polars
|
|
936
998
|
compression = "uncompressed"
|
937
999
|
end
|
938
1000
|
if Utils.pathlike?(file)
|
939
|
-
file = Utils.
|
1001
|
+
file = Utils.normalise_filepath(file)
|
940
1002
|
end
|
941
1003
|
|
942
1004
|
_df.write_ipc(file, compression)
|
@@ -978,7 +1040,7 @@ module Polars
|
|
978
1040
|
compression = "uncompressed"
|
979
1041
|
end
|
980
1042
|
if Utils.pathlike?(file)
|
981
|
-
file = Utils.
|
1043
|
+
file = Utils.normalise_filepath(file)
|
982
1044
|
end
|
983
1045
|
|
984
1046
|
_df.write_parquet(
|
@@ -1438,6 +1500,20 @@ module Polars
|
|
1438
1500
|
end
|
1439
1501
|
end
|
1440
1502
|
|
1503
|
+
# Sort the DataFrame by column in-place.
|
1504
|
+
#
|
1505
|
+
# @param by [String]
|
1506
|
+
# By which column to sort.
|
1507
|
+
# @param reverse [Boolean]
|
1508
|
+
# Reverse/descending sort.
|
1509
|
+
# @param nulls_last [Boolean]
|
1510
|
+
# Place null values last. Can only be used if sorted by a single column.
|
1511
|
+
#
|
1512
|
+
# @return [DataFrame]
|
1513
|
+
def sort!(by, reverse: false, nulls_last: false)
|
1514
|
+
self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
|
1515
|
+
end
|
1516
|
+
|
1441
1517
|
# Check if DataFrame is equal to other.
|
1442
1518
|
#
|
1443
1519
|
# @param other [DataFrame]
|
@@ -1495,7 +1571,7 @@ module Polars
|
|
1495
1571
|
# # │ 30 ┆ 6 │
|
1496
1572
|
# # └─────┴─────┘
|
1497
1573
|
def replace(column, new_col)
|
1498
|
-
_df.replace(column, new_col._s)
|
1574
|
+
_df.replace(column.to_s, new_col._s)
|
1499
1575
|
self
|
1500
1576
|
end
|
1501
1577
|
|
@@ -1836,7 +1912,7 @@ module Polars
|
|
1836
1912
|
# "2020-01-08 23:16:43"
|
1837
1913
|
# ]
|
1838
1914
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1839
|
-
# Polars.col("dt").str.strptime(
|
1915
|
+
# Polars.col("dt").str.strptime(Polars::Datetime)
|
1840
1916
|
# )
|
1841
1917
|
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1842
1918
|
# [
|
@@ -2767,6 +2843,16 @@ module Polars
|
|
2767
2843
|
Utils.wrap_s(_df.drop_in_place(name))
|
2768
2844
|
end
|
2769
2845
|
|
2846
|
+
# Drop a column in place if it exists.
|
2847
|
+
#
|
2848
|
+
# @param name [Object]
|
2849
|
+
# Column to drop.
|
2850
|
+
#
|
2851
|
+
# @return [Series, nil]
|
2852
|
+
def delete(name)
|
2853
|
+
drop_in_place(name) if include?(name)
|
2854
|
+
end
|
2855
|
+
|
2770
2856
|
# Create an empty copy of the current DataFrame.
|
2771
2857
|
#
|
2772
2858
|
# Returns a DataFrame with identical schema but no data.
|
@@ -3042,24 +3128,28 @@ module Polars
|
|
3042
3128
|
if aggregate_fn.is_a?(String)
|
3043
3129
|
case aggregate_fn
|
3044
3130
|
when "first"
|
3045
|
-
|
3131
|
+
aggregate_expr = Polars.element.first._rbexpr
|
3046
3132
|
when "sum"
|
3047
|
-
|
3133
|
+
aggregate_expr = Polars.element.sum._rbexpr
|
3048
3134
|
when "max"
|
3049
|
-
|
3135
|
+
aggregate_expr = Polars.element.max._rbexpr
|
3050
3136
|
when "min"
|
3051
|
-
|
3137
|
+
aggregate_expr = Polars.element.min._rbexpr
|
3052
3138
|
when "mean"
|
3053
|
-
|
3139
|
+
aggregate_expr = Polars.element.mean._rbexpr
|
3054
3140
|
when "median"
|
3055
|
-
|
3141
|
+
aggregate_expr = Polars.element.median._rbexpr
|
3056
3142
|
when "last"
|
3057
|
-
|
3143
|
+
aggregate_expr = Polars.element.last._rbexpr
|
3058
3144
|
when "count"
|
3059
|
-
|
3145
|
+
aggregate_expr = Polars.count._rbexpr
|
3060
3146
|
else
|
3061
3147
|
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3062
3148
|
end
|
3149
|
+
elsif aggregate_fn.nil?
|
3150
|
+
aggregate_expr = nil
|
3151
|
+
else
|
3152
|
+
aggregate_expr = aggregate_function._rbexpr
|
3063
3153
|
end
|
3064
3154
|
|
3065
3155
|
_from_rbdf(
|
@@ -3067,9 +3157,9 @@ module Polars
|
|
3067
3157
|
values,
|
3068
3158
|
index,
|
3069
3159
|
columns,
|
3070
|
-
aggregate_fn._rbexpr,
|
3071
3160
|
maintain_order,
|
3072
3161
|
sort_columns,
|
3162
|
+
aggregate_expr,
|
3073
3163
|
separator
|
3074
3164
|
)
|
3075
3165
|
)
|
@@ -3174,7 +3264,7 @@ module Polars
|
|
3174
3264
|
# # │ B ┆ 1 │
|
3175
3265
|
# # │ C ┆ 2 │
|
3176
3266
|
# # │ D ┆ 3 │
|
3177
|
-
# # │
|
3267
|
+
# # │ E ┆ 4 │
|
3178
3268
|
# # │ F ┆ 5 │
|
3179
3269
|
# # │ G ┆ 6 │
|
3180
3270
|
# # │ H ┆ 7 │
|
@@ -4053,15 +4143,12 @@ module Polars
|
|
4053
4143
|
# # │ 5 ┆ 3.0 ┆ true │
|
4054
4144
|
# # └─────┴─────┴───────┘
|
4055
4145
|
def unique(maintain_order: true, subset: nil, keep: "first")
|
4056
|
-
|
4057
|
-
|
4058
|
-
subset
|
4059
|
-
|
4060
|
-
|
4061
|
-
|
4062
|
-
end
|
4063
|
-
|
4064
|
-
_from_rbdf(_df.unique(maintain_order, subset, keep))
|
4146
|
+
self._from_rbdf(
|
4147
|
+
lazy
|
4148
|
+
.unique(maintain_order: maintain_order, subset: subset, keep: keep)
|
4149
|
+
.collect(no_optimization: true)
|
4150
|
+
._df
|
4151
|
+
)
|
4065
4152
|
end
|
4066
4153
|
|
4067
4154
|
# Return the number of unique rows, or the number of unique row-subsets.
|
@@ -4403,7 +4490,7 @@ module Polars
|
|
4403
4490
|
end
|
4404
4491
|
end
|
4405
4492
|
|
4406
|
-
# Returns an iterator over the DataFrame of rows of
|
4493
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4407
4494
|
#
|
4408
4495
|
# @param named [Boolean]
|
4409
4496
|
# Return hashes instead of arrays. The hashes are a mapping of
|
@@ -4464,6 +4551,24 @@ module Polars
|
|
4464
4551
|
end
|
4465
4552
|
end
|
4466
4553
|
|
4554
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4555
|
+
#
|
4556
|
+
# @param named [Boolean]
|
4557
|
+
# Return hashes instead of arrays. The hashes are a mapping of
|
4558
|
+
# column name to row value. This is more expensive than returning an
|
4559
|
+
# array, but allows for accessing values by column name.
|
4560
|
+
# @param buffer_size [Integer]
|
4561
|
+
# Determines the number of rows that are buffered internally while iterating
|
4562
|
+
# over the data; you should only modify this in very specific cases where the
|
4563
|
+
# default value is determined not to be a good fit to your access pattern, as
|
4564
|
+
# the speedup from using the buffer is significant (~2-4x). Setting this
|
4565
|
+
# value to zero disables row buffering.
|
4566
|
+
#
|
4567
|
+
# @return [Object]
|
4568
|
+
def each_row(named: true, buffer_size: 500, &block)
|
4569
|
+
iter_rows(named: named, buffer_size: buffer_size, &block)
|
4570
|
+
end
|
4571
|
+
|
4467
4572
|
# Shrink DataFrame memory usage.
|
4468
4573
|
#
|
4469
4574
|
# Shrinks to fit the exact capacity needed to hold the data.
|
@@ -4717,20 +4822,63 @@ module Polars
|
|
4717
4822
|
end
|
4718
4823
|
|
4719
4824
|
# @private
|
4720
|
-
def self.
|
4721
|
-
|
4722
|
-
|
4825
|
+
def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
|
4826
|
+
updated_data = {}
|
4827
|
+
unless data.empty?
|
4828
|
+
dtypes = schema_overrides || {}
|
4829
|
+
array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
|
4830
|
+
if array_len > 0
|
4831
|
+
data.each do |name, val|
|
4832
|
+
dtype = dtypes[name]
|
4833
|
+
if val.is_a?(Hash) && dtype != Struct
|
4834
|
+
updated_data[name] = DataFrame.new(val).to_struct(name)
|
4835
|
+
elsif !Utils.arrlen(val).nil?
|
4836
|
+
updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
|
4837
|
+
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4838
|
+
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4839
|
+
updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
4840
|
+
else
|
4841
|
+
raise Todo
|
4842
|
+
end
|
4843
|
+
end
|
4844
|
+
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4845
|
+
data.each do |name, val|
|
4846
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
4847
|
+
end
|
4848
|
+
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4849
|
+
data.each do |name, val|
|
4850
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
4851
|
+
end
|
4852
|
+
end
|
4853
|
+
end
|
4854
|
+
updated_data
|
4855
|
+
end
|
4723
4856
|
|
4724
|
-
|
4725
|
-
|
4726
|
-
|
4727
|
-
|
4857
|
+
# @private
|
4858
|
+
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
|
4859
|
+
if schema.is_a?(Hash) && !data.empty?
|
4860
|
+
if !data.all? { |col, _| schema[col] }
|
4861
|
+
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
4728
4862
|
end
|
4729
|
-
|
4730
|
-
|
4863
|
+
|
4864
|
+
data = schema.to_h { |col| [col, data[col]] }
|
4865
|
+
end
|
4866
|
+
|
4867
|
+
column_names, schema_overrides = _unpack_schema(
|
4868
|
+
schema, lookup_names: data.keys, schema_overrides: schema_overrides
|
4869
|
+
)
|
4870
|
+
if column_names.empty?
|
4871
|
+
column_names = data.keys
|
4872
|
+
end
|
4873
|
+
|
4874
|
+
if data.empty? && !schema_overrides.empty?
|
4875
|
+
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
|
4876
|
+
else
|
4877
|
+
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
|
4731
4878
|
end
|
4732
4879
|
|
4733
|
-
|
4880
|
+
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
4881
|
+
RbDataFrame.new(data_series)
|
4734
4882
|
end
|
4735
4883
|
|
4736
4884
|
# @private
|
@@ -4739,14 +4887,12 @@ module Polars
|
|
4739
4887
|
end
|
4740
4888
|
|
4741
4889
|
# @private
|
4742
|
-
def self.
|
4743
|
-
|
4744
|
-
|
4745
|
-
if columns.is_a?(Hash)
|
4746
|
-
columns = columns.to_a
|
4890
|
+
def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
|
4891
|
+
if schema.is_a?(Hash)
|
4892
|
+
schema = schema.to_a
|
4747
4893
|
end
|
4748
4894
|
column_names =
|
4749
|
-
(
|
4895
|
+
(schema || []).map.with_index do |col, i|
|
4750
4896
|
if col.is_a?(String)
|
4751
4897
|
col || "column_#{i}"
|
4752
4898
|
else
|
@@ -4759,21 +4905,38 @@ module Polars
|
|
4759
4905
|
# TODO zip_longest
|
4760
4906
|
lookup = column_names.zip(lookup_names || []).to_h
|
4761
4907
|
|
4762
|
-
|
4763
|
-
|
4764
|
-
(columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4908
|
+
column_dtypes =
|
4909
|
+
(schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4765
4910
|
[lookup[col[0]] || col[0], col[1]]
|
4766
4911
|
end
|
4767
|
-
|
4912
|
+
|
4913
|
+
if schema_overrides
|
4914
|
+
raise Todo
|
4915
|
+
end
|
4916
|
+
|
4917
|
+
column_dtypes.each do |col, dtype|
|
4918
|
+
if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
|
4919
|
+
column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
|
4920
|
+
end
|
4921
|
+
end
|
4922
|
+
|
4923
|
+
[column_names, column_dtypes]
|
4768
4924
|
end
|
4769
4925
|
|
4770
|
-
def self._handle_columns_arg(data, columns: nil)
|
4771
|
-
if columns.nil?
|
4926
|
+
def self._handle_columns_arg(data, columns: nil, from_hash: false)
|
4927
|
+
if columns.nil? || columns.empty?
|
4772
4928
|
data
|
4773
4929
|
else
|
4774
4930
|
if data.empty?
|
4775
4931
|
columns.map { |c| Series.new(c, nil)._s }
|
4776
4932
|
elsif data.length == columns.length
|
4933
|
+
if from_hash
|
4934
|
+
series_map = data.to_h { |s| [s.name, s] }
|
4935
|
+
if columns.all? { |col| series_map.key?(col) }
|
4936
|
+
return columns.map { |col| series_map[col] }
|
4937
|
+
end
|
4938
|
+
end
|
4939
|
+
|
4777
4940
|
columns.each_with_index do |c, i|
|
4778
4941
|
# not in-place?
|
4779
4942
|
data[i].rename(c)
|
@@ -4788,7 +4951,7 @@ module Polars
|
|
4788
4951
|
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
4789
4952
|
rbdf_columns = rbdf.columns
|
4790
4953
|
rbdf_dtypes = rbdf.dtypes
|
4791
|
-
columns, dtypes =
|
4954
|
+
columns, dtypes = _unpack_schema(
|
4792
4955
|
(columns || rbdf_columns), schema_overrides: schema_overrides
|
4793
4956
|
)
|
4794
4957
|
column_subset = []
|
@@ -4826,20 +4989,23 @@ module Polars
|
|
4826
4989
|
end
|
4827
4990
|
|
4828
4991
|
# @private
|
4829
|
-
def self.sequence_to_rbdf(data,
|
4992
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
4993
|
+
raise Todo if schema_overrides
|
4994
|
+
columns = schema
|
4995
|
+
|
4830
4996
|
if data.length == 0
|
4831
|
-
return hash_to_rbdf({},
|
4997
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
4832
4998
|
end
|
4833
4999
|
|
4834
5000
|
if data[0].is_a?(Series)
|
4835
5001
|
# series_names = data.map(&:name)
|
4836
|
-
# columns, dtypes =
|
5002
|
+
# columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
|
4837
5003
|
data_series = []
|
4838
5004
|
data.each do |s|
|
4839
5005
|
data_series << s._s
|
4840
5006
|
end
|
4841
5007
|
elsif data[0].is_a?(Hash)
|
4842
|
-
column_names, dtypes =
|
5008
|
+
column_names, dtypes = _unpack_schema(columns)
|
4843
5009
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
4844
5010
|
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
|
4845
5011
|
if column_names
|
@@ -4865,11 +5031,21 @@ module Polars
|
|
4865
5031
|
end
|
4866
5032
|
|
4867
5033
|
# @private
|
4868
|
-
def self.series_to_rbdf(data,
|
4869
|
-
|
4870
|
-
|
5034
|
+
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
|
5035
|
+
data_series = [data._s]
|
5036
|
+
series_name = data_series.map(&:name)
|
5037
|
+
column_names, schema_overrides = _unpack_schema(
|
5038
|
+
schema || series_name, schema_overrides: schema_overrides, n_expected: 1
|
5039
|
+
)
|
5040
|
+
if schema_overrides.any?
|
5041
|
+
new_dtype = schema_overrides.values[0]
|
5042
|
+
if new_dtype != data.dtype
|
5043
|
+
data_series[0] = data_series[0].cast(new_dtype, true)
|
5044
|
+
end
|
4871
5045
|
end
|
4872
|
-
|
5046
|
+
|
5047
|
+
data_series = _handle_columns_arg(data_series, columns: column_names)
|
5048
|
+
RbDataFrame.new(data_series)
|
4873
5049
|
end
|
4874
5050
|
|
4875
5051
|
def wrap_ldf(ldf)
|
data/lib/polars/data_types.rb
CHANGED
@@ -84,20 +84,22 @@ module Polars
|
|
84
84
|
|
85
85
|
# Calendar date and time type.
|
86
86
|
class Datetime < TemporalType
|
87
|
-
attr_reader :
|
87
|
+
attr_reader :time_unit, :time_zone
|
88
|
+
alias_method :tu, :time_unit
|
88
89
|
|
89
90
|
def initialize(time_unit = "us", time_zone = nil)
|
90
|
-
@
|
91
|
+
@time_unit = time_unit || "us"
|
91
92
|
@time_zone = time_zone
|
92
93
|
end
|
93
94
|
end
|
94
95
|
|
95
96
|
# Time duration/delta type.
|
96
97
|
class Duration < TemporalType
|
97
|
-
attr_reader :
|
98
|
+
attr_reader :time_unit
|
99
|
+
alias_method :tu, :time_unit
|
98
100
|
|
99
101
|
def initialize(time_unit = "us")
|
100
|
-
@
|
102
|
+
@time_unit = time_unit
|
101
103
|
end
|
102
104
|
end
|
103
105
|
|