polars-df 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +15 -7
  8. data/ext/polars/src/batched_csv.rs +4 -4
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +260 -340
  12. data/ext/polars/src/dataframe.rs +69 -53
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/datetime.rs +22 -56
  15. data/ext/polars/src/expr/general.rs +61 -33
  16. data/ext/polars/src/expr/list.rs +52 -4
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +59 -8
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/functions/aggregation.rs +6 -0
  22. data/ext/polars/src/functions/lazy.rs +103 -48
  23. data/ext/polars/src/functions/meta.rs +45 -1
  24. data/ext/polars/src/functions/string_cache.rs +14 -0
  25. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +138 -22
  26. data/ext/polars/src/lib.rs +226 -168
  27. data/ext/polars/src/series/aggregation.rs +20 -0
  28. data/ext/polars/src/series/mod.rs +25 -4
  29. data/lib/polars/array_expr.rb +449 -0
  30. data/lib/polars/array_name_space.rb +346 -0
  31. data/lib/polars/cat_expr.rb +24 -0
  32. data/lib/polars/cat_name_space.rb +75 -0
  33. data/lib/polars/config.rb +2 -2
  34. data/lib/polars/data_frame.rb +179 -43
  35. data/lib/polars/data_types.rb +191 -28
  36. data/lib/polars/date_time_expr.rb +31 -14
  37. data/lib/polars/exceptions.rb +12 -1
  38. data/lib/polars/expr.rb +866 -186
  39. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  40. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  41. data/lib/polars/functions/as_datatype.rb +248 -0
  42. data/lib/polars/functions/col.rb +47 -0
  43. data/lib/polars/functions/eager.rb +182 -0
  44. data/lib/polars/functions/lazy.rb +1280 -0
  45. data/lib/polars/functions/len.rb +49 -0
  46. data/lib/polars/functions/lit.rb +35 -0
  47. data/lib/polars/functions/random.rb +16 -0
  48. data/lib/polars/functions/range/date_range.rb +103 -0
  49. data/lib/polars/functions/range/int_range.rb +51 -0
  50. data/lib/polars/functions/repeat.rb +144 -0
  51. data/lib/polars/functions/whenthen.rb +27 -0
  52. data/lib/polars/functions.rb +29 -416
  53. data/lib/polars/group_by.rb +2 -2
  54. data/lib/polars/io.rb +18 -25
  55. data/lib/polars/lazy_frame.rb +367 -53
  56. data/lib/polars/list_expr.rb +152 -6
  57. data/lib/polars/list_name_space.rb +102 -0
  58. data/lib/polars/meta_expr.rb +175 -7
  59. data/lib/polars/series.rb +273 -34
  60. data/lib/polars/string_cache.rb +75 -0
  61. data/lib/polars/string_expr.rb +412 -96
  62. data/lib/polars/string_name_space.rb +4 -4
  63. data/lib/polars/testing.rb +507 -0
  64. data/lib/polars/utils.rb +52 -8
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars.rb +15 -2
  67. metadata +35 -5
  68. data/lib/polars/lazy_functions.rb +0 -1181
@@ -1,24 +1,122 @@
1
1
  module Polars
2
2
  # Base class for all Polars data types.
3
3
  class DataType
4
+ # Return this DataType's fundamental/root type class.
5
+ #
6
+ # @return [Class]
7
+ #
8
+ # @example
9
+ # Polars::Datetime.new("ns").base_type
10
+ # # => Polars::Datetime
11
+ # @example
12
+ # Polars::List.new(Polars::Int32).base_type
13
+ # # => Polars::List
14
+ # @example
15
+ # Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)]).base_type
16
+ # # => Polars::Struct
4
17
  def self.base_type
5
18
  self
6
19
  end
7
20
 
21
+ # Return this DataType's fundamental/root type class.
22
+ #
23
+ # @return [Class]
8
24
  def base_type
9
25
  is_a?(DataType) ? self.class : self
10
26
  end
11
27
 
28
+ # Check if this DataType is the same as another DataType.
29
+ #
30
+ # @return [Boolean]
31
+ def self.==(other)
32
+ eql?(other) || other.is_a?(self)
33
+ end
34
+
35
+ # Check if this DataType is the same as another DataType.
36
+ #
37
+ # @return [Boolean]
38
+ def ==(other)
39
+ if other.is_a?(Class)
40
+ is_a?(other)
41
+ else
42
+ other.instance_of?(self.class)
43
+ end
44
+ end
45
+
46
+ # Check whether the data type is a numeric type.
47
+ #
48
+ # @return [Boolean]
49
+ def self.numeric?
50
+ self < NumericType
51
+ end
52
+
53
+ # Check whether the data type is a decimal type.
54
+ #
55
+ # @return [Boolean]
56
+ def self.decimal?
57
+ self == Decimal
58
+ end
59
+
60
+ # Check whether the data type is an integer type.
61
+ #
62
+ # @return [Boolean]
63
+ def self.integer?
64
+ self < IntegerType
65
+ end
66
+
67
+ # Check whether the data type is a signed integer type.
68
+ #
69
+ # @return [Boolean]
70
+ def self.signed_integer?
71
+ self < SignedIntegerType
72
+ end
73
+
74
+ # Check whether the data type is an unsigned integer type.
75
+ #
76
+ # @return [Boolean]
77
+ def self.unsigned_integer?
78
+ self < UnsignedIntegerType
79
+ end
80
+
81
+ # Check whether the data type is a float type.
82
+ #
83
+ # @return [Boolean]
84
+ def self.float?
85
+ self < FloatType
86
+ end
87
+
88
+ # Check whether the data type is a temporal type.
89
+ #
90
+ # @return [Boolean]
91
+ def self.temporal?
92
+ self < TemporalType
93
+ end
94
+
95
+ # Check whether the data type is a nested type.
96
+ #
97
+ # @return [Boolean]
12
98
  def self.nested?
13
- false
99
+ self < NestedType
14
100
  end
15
101
 
16
- def nested?
17
- self.class.nested?
102
+ [:numeric?, :decimal?, :integer?, :signed_integer?, :unsigned_integer?, :float?, :temporal?, :nested?].each do |v|
103
+ define_method(v) do
104
+ self.class.public_send(v)
105
+ end
18
106
  end
19
107
 
20
- def self.==(other)
21
- eql?(other) || other.is_a?(self)
108
+ # Returns a string representing the data type.
109
+ #
110
+ # @return [String]
111
+ def to_s
112
+ self.class.name
113
+ end
114
+
115
+ # Returns a string representing the data type.
116
+ #
117
+ # @return [String]
118
+ def inspect
119
+ to_s
22
120
  end
23
121
  end
24
122
 
@@ -27,15 +125,22 @@ module Polars
27
125
  end
28
126
 
29
127
  # Base class for integral data types.
30
- class IntegralType < NumericType
128
+ class IntegerType < NumericType
31
129
  end
32
130
 
33
- # Base class for fractional data types.
34
- class FractionalType < NumericType
131
+ # @private
132
+ IntegralType = IntegerType
133
+
134
+ # Base class for signed integer data types.
135
+ class SignedIntegerType < IntegerType
136
+ end
137
+
138
+ # Base class for unsigned integer data types.
139
+ class UnsignedIntegerType < IntegerType
35
140
  end
36
141
 
37
142
  # Base class for float data types.
38
- class FloatType < FractionalType
143
+ class FloatType < NumericType
39
144
  end
40
145
 
41
146
  # Base class for temporal data types.
@@ -44,41 +149,38 @@ module Polars
44
149
 
45
150
  # Base class for nested data types.
46
151
  class NestedType < DataType
47
- def self.nested?
48
- true
49
- end
50
152
  end
51
153
 
52
154
  # 8-bit signed integer type.
53
- class Int8 < IntegralType
155
+ class Int8 < SignedIntegerType
54
156
  end
55
157
 
56
158
  # 16-bit signed integer type.
57
- class Int16 < IntegralType
159
+ class Int16 < SignedIntegerType
58
160
  end
59
161
 
60
162
  # 32-bit signed integer type.
61
- class Int32 < IntegralType
163
+ class Int32 < SignedIntegerType
62
164
  end
63
165
 
64
166
  # 64-bit signed integer type.
65
- class Int64 < IntegralType
167
+ class Int64 < SignedIntegerType
66
168
  end
67
169
 
68
170
  # 8-bit unsigned integer type.
69
- class UInt8 < IntegralType
171
+ class UInt8 < UnsignedIntegerType
70
172
  end
71
173
 
72
174
  # 16-bit unsigned integer type.
73
- class UInt16 < IntegralType
175
+ class UInt16 < UnsignedIntegerType
74
176
  end
75
177
 
76
178
  # 32-bit unsigned integer type.
77
- class UInt32 < IntegralType
179
+ class UInt32 < UnsignedIntegerType
78
180
  end
79
181
 
80
182
  # 64-bit unsigned integer type.
81
- class UInt64 < IntegralType
183
+ class UInt64 < UnsignedIntegerType
82
184
  end
83
185
 
84
186
  # 32-bit floating point type.
@@ -92,7 +194,7 @@ module Polars
92
194
  # Decimal 128-bit type with an optional precision and non-negative scale.
93
195
  #
94
196
  # NOTE: this is an experimental work-in-progress feature and may not work as expected.
95
- class Decimal < FractionalType
197
+ class Decimal < NumericType
96
198
  attr_reader :precision, :scale
97
199
 
98
200
  def initialize(precision, scale)
@@ -123,6 +225,7 @@ module Polars
123
225
  class String < DataType
124
226
  end
125
227
 
228
+ # @private
126
229
  # Allow Utf8 as an alias for String
127
230
  Utf8 = String
128
231
 
@@ -189,6 +292,59 @@ module Polars
189
292
 
190
293
  # A categorical encoding of a set of strings.
191
294
  class Categorical < DataType
295
+ def initialize(ordering = "physical")
296
+ @ordering = ordering
297
+ end
298
+ end
299
+
300
+ # A fixed set categorical encoding of a set of strings.
301
+ #
302
+ # NOTE: this is an experimental work-in-progress feature and may not work as expected.
303
+ class Enum < DataType
304
+ attr_reader :categories
305
+
306
+ def initialize(categories)
307
+ if !categories.is_a?(Series)
308
+ categories = Series.new(categories)
309
+ end
310
+
311
+ if categories.empty?
312
+ self.categories = Series.new("category", [], dtype: String)
313
+ return
314
+ end
315
+
316
+ if categories.null_count > 0
317
+ msg = "Enum categories must not contain null values"
318
+ raise TypeError, msg
319
+ end
320
+
321
+ if (dtype = categories.dtype) != String
322
+ msg = "Enum categories must be strings; found data of type #{dtype}"
323
+ raise TypeError, msg
324
+ end
325
+
326
+ if categories.n_unique != categories.len
327
+ duplicate = categories.filter(categories.is_duplicated)[0]
328
+ msg = "Enum categories must be unique; found duplicate #{duplicate}"
329
+ raise ArgumentError, msg
330
+ end
331
+
332
+ @categories = categories.rechunk.alias("category")
333
+ end
334
+
335
+ def ==(other)
336
+ if other.eql?(Enum)
337
+ true
338
+ elsif other.is_a?(Enum)
339
+ categories == other.categories
340
+ else
341
+ false
342
+ end
343
+ end
344
+
345
+ def to_s
346
+ "#{self.class.name}(categories: #{categories.to_a.inspect})"
347
+ end
192
348
  end
193
349
 
194
350
  # Type for wrapping arbitrary Ruby objects.
@@ -228,27 +384,34 @@ module Polars
228
384
 
229
385
  # Nested list/array type.
230
386
  class Array < NestedType
231
- attr_reader :width, :inner
387
+ attr_reader :inner, :width
232
388
 
233
- def initialize(width, inner = nil)
234
- @width = width
389
+ def initialize(inner, width)
390
+ if width.is_a?(DataType) || (width.is_a?(Class) && width < DataType)
391
+ inner, width = width, inner
392
+ end
235
393
  @inner = Utils.rb_type_to_dtype(inner) if inner
394
+ @width = width
236
395
  end
237
396
 
238
- # TODO check width?
239
397
  def ==(other)
240
398
  if other.eql?(Array)
241
399
  true
242
400
  elsif other.is_a?(Array)
243
- @inner.nil? || other.inner.nil? || @inner == other.inner
401
+ if @width != other.width
402
+ false
403
+ elsif @inner.nil? || other.inner.nil?
404
+ true
405
+ else
406
+ @inner == other.inner
407
+ end
244
408
  else
245
409
  false
246
410
  end
247
411
  end
248
412
 
249
- # TODO add width?
250
413
  def to_s
251
- "#{self.class.name}(#{inner})"
414
+ "#{self.class.name}(#{inner}, width: #{width.inspect})"
252
415
  end
253
416
  end
254
417
 
@@ -1066,9 +1066,10 @@ module Polars
1066
1066
  # # │ 2020-04-01 00:00:00 ┆ 31 │
1067
1067
  # # │ 2020-05-01 00:00:00 ┆ 30 │
1068
1068
  # # └─────────────────────┴───────────┘
1069
def total_days
  # Wrap the native `dt_total_days` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_days)
end
# `days` was this method's name before the rename; keep it callable.
alias_method :days, :total_days
1072
1073
 
1073
1074
  # Extract the hours from a Duration type.
1074
1075
  #
@@ -1100,9 +1101,10 @@ module Polars
1100
1101
  # # │ 2020-01-03 00:00:00 ┆ 24 │
1101
1102
  # # │ 2020-01-04 00:00:00 ┆ 24 │
1102
1103
  # # └─────────────────────┴────────────┘
1103
def total_hours
  # Wrap the native `dt_total_hours` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_hours)
end
# `hours` was this method's name before the rename; keep it callable.
alias_method :hours, :total_hours
1106
1108
 
1107
1109
  # Extract the minutes from a Duration type.
1108
1110
  #
@@ -1134,9 +1136,10 @@ module Polars
1134
1136
  # # │ 2020-01-03 00:00:00 ┆ 1440 │
1135
1137
  # # │ 2020-01-04 00:00:00 ┆ 1440 │
1136
1138
  # # └─────────────────────┴──────────────┘
1137
def total_minutes
  # Wrap the native `dt_total_minutes` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_minutes)
end
# `minutes` was this method's name before the rename; keep it callable.
alias_method :minutes, :total_minutes
1140
1143
 
1141
1144
  # Extract the seconds from a Duration type.
1142
1145
  #
@@ -1169,9 +1172,10 @@ module Polars
1169
1172
  # # │ 2020-01-01 00:03:00 ┆ 60 │
1170
1173
  # # │ 2020-01-01 00:04:00 ┆ 60 │
1171
1174
  # # └─────────────────────┴──────────────┘
1172
def total_seconds
  # Wrap the native `dt_total_seconds` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_seconds)
end
# `seconds` was this method's name before the rename; keep it callable.
alias_method :seconds, :total_seconds
1175
1179
 
1176
1180
  # Extract the milliseconds from a Duration type.
1177
1181
  #
@@ -1202,15 +1206,18 @@ module Polars
1202
1206
  # # │ 2020-01-01 00:00:00.001 ┆ 1 │
1203
1207
  # # │ 2020-01-01 00:00:00.002 ┆ 1 │
1204
1208
  # # │ 2020-01-01 00:00:00.003 ┆ 1 │
1209
+ # # │ 2020-01-01 00:00:00.004 ┆ 1 │
1205
1210
  # # │ … ┆ … │
1211
+ # # │ 2020-01-01 00:00:00.996 ┆ 1 │
1206
1212
  # # │ 2020-01-01 00:00:00.997 ┆ 1 │
1207
1213
  # # │ 2020-01-01 00:00:00.998 ┆ 1 │
1208
1214
  # # │ 2020-01-01 00:00:00.999 ┆ 1 │
1209
1215
  # # │ 2020-01-01 00:00:01 ┆ 1 │
1210
1216
  # # └─────────────────────────┴───────────────────┘
1211
def total_milliseconds
  # Wrap the native `dt_total_milliseconds` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_milliseconds)
end
# `milliseconds` was this method's name before the rename; keep it callable.
alias_method :milliseconds, :total_milliseconds
1214
1221
 
1215
1222
  # Extract the microseconds from a Duration type.
1216
1223
  #
@@ -1241,15 +1248,18 @@ module Polars
1241
1248
  # # │ 2020-01-01 00:00:00.001 ┆ 1000 │
1242
1249
  # # │ 2020-01-01 00:00:00.002 ┆ 1000 │
1243
1250
  # # │ 2020-01-01 00:00:00.003 ┆ 1000 │
1251
+ # # │ 2020-01-01 00:00:00.004 ┆ 1000 │
1244
1252
  # # │ … ┆ … │
1253
+ # # │ 2020-01-01 00:00:00.996 ┆ 1000 │
1245
1254
  # # │ 2020-01-01 00:00:00.997 ┆ 1000 │
1246
1255
  # # │ 2020-01-01 00:00:00.998 ┆ 1000 │
1247
1256
  # # │ 2020-01-01 00:00:00.999 ┆ 1000 │
1248
1257
  # # │ 2020-01-01 00:00:01 ┆ 1000 │
1249
1258
  # # └─────────────────────────┴───────────────────┘
1250
def total_microseconds
  # Wrap the native `dt_total_microseconds` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_microseconds)
end
# `microseconds` was this method's name before the rename; keep it callable.
alias_method :microseconds, :total_microseconds
1253
1263
 
1254
1264
  # Extract the nanoseconds from a Duration type.
1255
1265
  #
@@ -1280,15 +1290,18 @@ module Polars
1280
1290
  # # │ 2020-01-01 00:00:00.001 ┆ 1000000 │
1281
1291
  # # │ 2020-01-01 00:00:00.002 ┆ 1000000 │
1282
1292
  # # │ 2020-01-01 00:00:00.003 ┆ 1000000 │
1293
+ # # │ 2020-01-01 00:00:00.004 ┆ 1000000 │
1283
1294
  # # │ … ┆ … │
1295
+ # # │ 2020-01-01 00:00:00.996 ┆ 1000000 │
1284
1296
  # # │ 2020-01-01 00:00:00.997 ┆ 1000000 │
1285
1297
  # # │ 2020-01-01 00:00:00.998 ┆ 1000000 │
1286
1298
  # # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
1287
1299
  # # │ 2020-01-01 00:00:01 ┆ 1000000 │
1288
1300
  # # └─────────────────────────┴──────────────────┘
1289
def total_nanoseconds
  # Wrap the native `dt_total_nanoseconds` expression via Utils.wrap_expr.
  Utils.wrap_expr(_rbexpr.dt_total_nanoseconds)
end
# `nanoseconds` was this method's name before the rename; keep it callable.
alias_method :nanoseconds, :total_nanoseconds
1292
1305
 
1293
1306
  # Offset this date by a relative time offset.
1294
1307
  #
@@ -1372,7 +1385,9 @@ module Polars
1372
1385
  # # │ 2000-02-01 02:00:00 │
1373
1386
  # # │ 2000-03-01 02:00:00 │
1374
1387
  # # │ 2000-04-01 02:00:00 │
1388
+ # # │ 2000-05-01 02:00:00 │
1375
1389
  # # │ … │
1390
+ # # │ 2000-08-01 02:00:00 │
1376
1391
  # # │ 2000-09-01 02:00:00 │
1377
1392
  # # │ 2000-10-01 02:00:00 │
1378
1393
  # # │ 2000-11-01 02:00:00 │
@@ -1408,7 +1423,9 @@ module Polars
1408
1423
  # # │ 2000-02-29 02:00:00 │
1409
1424
  # # │ 2000-03-31 02:00:00 │
1410
1425
  # # │ 2000-04-30 02:00:00 │
1426
+ # # │ 2000-05-31 02:00:00 │
1411
1427
  # # │ … │
1428
+ # # │ 2000-08-31 02:00:00 │
1412
1429
  # # │ 2000-09-30 02:00:00 │
1413
1430
  # # │ 2000-10-31 02:00:00 │
1414
1431
  # # │ 2000-11-30 02:00:00 │
@@ -1,15 +1,26 @@
1
1
  module Polars
2
2
  # @private
3
+ # Base class for all Polars errors.
3
4
  class Error < StandardError; end
4
5
 
5
6
  # @private
7
+ # Exception raised when an unsupported testing assert is made.
8
+ class InvalidAssert < Error; end
9
+
10
+ # @private
11
+ # Exception raised when the number of returned rows does not match expectation.
6
12
  class RowsException < Error; end
7
13
 
8
14
  # @private
15
+ # Exception raised when no rows are returned, but at least one row is expected.
16
+ class NoRowsReturned < RowsException; end
17
+
18
+ # @private
19
+ # Exception raised when more rows than expected are returned.
9
20
  class TooManyRowsReturned < RowsException; end
10
21
 
11
22
  # @private
12
- class NoRowsReturned < RowsException; end
23
+ class AssertionError < Error; end
13
24
 
14
25
  # @private
15
26
  class Todo < Error