polars-df 0.10.0-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
@@ -0,0 +1,423 @@
module Polars
  # Series.arr namespace.
  #
  # Every public method below only calls `super`; the actual work is presumably
  # performed by the dispatch logic mixed in from ExprDispatch (not visible
  # here), keyed on the "arr" accessor set below — TODO confirm.
  class ArrayNameSpace
    include ExprDispatch

    # Accessor name for this namespace (the `_accessor=` writer is presumably
    # provided by ExprDispatch — confirm).
    self._accessor = "arr"

    # @private
    #
    # Stores the underlying `_s` handle of the given series.
    def initialize(series)
      self._s = series._s
    end

    # Compute the min values of the sub-arrays.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2)
    #   )
    #   s.arr.min
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [i64]
    #   # [
    #   #         1
    #   #         3
    #   # ]
    def min
      super
    end

    # Compute the max values of the sub-arrays.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2)
    #   )
    #   s.arr.max
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [i64]
    #   # [
    #   #         2
    #   #         4
    #   # ]
    def max
      super
    end

    # Compute the sum values of the sub-arrays.
    #
    # @return [Series]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {"a" => [[1, 2], [4, 3]]},
    #     schema: {"a" => Polars::Array.new(Polars::Int64, 2)}
    #   )
    #   df.select(Polars.col("a").arr.sum)
    #   # =>
    #   # shape: (2, 1)
    #   # ┌─────┐
    #   # │ a   │
    #   # │ --- │
    #   # │ i64 │
    #   # ╞═════╡
    #   # │ 3   │
    #   # │ 7   │
    #   # └─────┘
    def sum
      super
    end

    # Get the unique/distinct values in the array.
    #
    # @param maintain_order [Boolean]
    #   Maintain order of data. This requires more work.
    #
    # @return [Series]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "a" => [[1, 1, 2]]
    #     },
    #     schema_overrides: {"a" => Polars::Array.new(Polars::Int64, 3)}
    #   )
    #   df.select(Polars.col("a").arr.unique)
    #   # =>
    #   # shape: (1, 1)
    #   # ┌───────────┐
    #   # │ a         │
    #   # │ ---       │
    #   # │ list[i64] │
    #   # ╞═══════════╡
    #   # │ [1, 2]    │
    #   # └───────────┘
    def unique(maintain_order: false)
      super
    end

    # Convert an Array column into a List column with the same inner data type.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new([[1, 2], [3, 4]], dtype: Polars::Array.new(Polars::Int8, 2))
    #   s.arr.to_list
    #   # =>
    #   # shape: (2,)
    #   # Series: '' [list[i8]]
    #   # [
    #   #         [1, 2]
    #   #         [3, 4]
    #   # ]
    def to_list
      super
    end

    # Evaluate whether any boolean value is true for every sub-array.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     [[true, true], [false, true], [false, false], [nil, nil], nil],
    #     dtype: Polars::Array.new(Polars::Boolean, 2)
    #   )
    #   s.arr.any
    #   # =>
    #   # shape: (5,)
    #   # Series: '' [bool]
    #   # [
    #   #         true
    #   #         true
    #   #         false
    #   #         false
    #   #         null
    #   # ]
    def any
      super
    end

    # Evaluate whether all boolean values are true for every sub-array.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     [[true, true], [false, true], [false, false], [nil, nil], nil],
    #     dtype: Polars::Array.new(Polars::Boolean, 2)
    #   )
    #   s.arr.all
    #   # =>
    #   # shape: (5,)
    #   # Series: '' [bool]
    #   # [
    #   #         true
    #   #         false
    #   #         false
    #   #         true
    #   #         null
    #   # ]
    def all
      super
    end

    # Sort the arrays in this column.
    #
    # @param descending [Boolean]
    #   Sort in descending order.
    # @param nulls_last [Boolean]
    #   Place null values last.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new("a", [[3, 2, 1], [9, 1, 2]], dtype: Polars::Array.new(Polars::Int64, 3))
    #   s.arr.sort
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [array[i64, 3]]
    #   # [
    #   #         [1, 2, 3]
    #   #         [1, 2, 9]
    #   # ]
    #
    # @example
    #   s.arr.sort(descending: true)
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [array[i64, 3]]
    #   # [
    #   #         [3, 2, 1]
    #   #         [9, 2, 1]
    #   # ]
    def sort(descending: false, nulls_last: false)
      super
    end

    # Reverse the arrays in this column.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new("a", [[3, 2, 1], [9, 1, 2]], dtype: Polars::Array.new(Polars::Int64, 3))
    #   s.arr.reverse
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [array[i64, 3]]
    #   # [
    #   #         [1, 2, 3]
    #   #         [2, 1, 9]
    #   # ]
    def reverse
      super
    end

    # Retrieve the index of the minimal value in every sub-array.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new("a", [[3, 2, 1], [9, 1, 2]], dtype: Polars::Array.new(Polars::Int64, 3))
    #   s.arr.arg_min
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [u32]
    #   # [
    #   #         2
    #   #         1
    #   # ]
    def arg_min
      super
    end

    # Retrieve the index of the maximum value in every sub-array.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new("a", [[0, 9, 3], [9, 1, 2]], dtype: Polars::Array.new(Polars::Int64, 3))
    #   s.arr.arg_max
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [u32]
    #   # [
    #   #         1
    #   #         0
    #   # ]
    def arg_max
      super
    end

    # Get the value by index in the sub-arrays.
    #
    # Index `0` returns the first item of every sub-array and index `-1`
    # returns the last item of every sub-array. If an index is out of bounds,
    # the result for that sub-array is `nil`.
    #
    # @param index [Integer]
    #   Index to return per sub-array. The example below passes a Series
    #   instead, giving one index per sub-array.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     "a", [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype: Polars::Array.new(Polars::Int32, 3)
    #   )
    #   s.arr.get(Polars::Series.new([1, -2, 4]))
    #   # =>
    #   # shape: (3,)
    #   # Series: 'a' [i32]
    #   # [
    #   #         2
    #   #         5
    #   #         null
    #   # ]
    def get(index)
      super
    end

    # Get the first value of the sub-arrays.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     "a", [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype: Polars::Array.new(Polars::Int32, 3)
    #   )
    #   s.arr.first
    #   # =>
    #   # shape: (3,)
    #   # Series: 'a' [i32]
    #   # [
    #   #         1
    #   #         4
    #   #         7
    #   # ]
    def first
      super
    end

    # Get the last value of the sub-arrays.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     "a", [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype: Polars::Array.new(Polars::Int32, 3)
    #   )
    #   s.arr.last
    #   # =>
    #   # shape: (3,)
    #   # Series: 'a' [i32]
    #   # [
    #   #         3
    #   #         6
    #   #         9
    #   # ]
    def last
      super
    end

    # Join all string items in a sub-array and place a separator between them.
    #
    # This errors if inner type of array `!= String`.
    #
    # @param separator [String]
    #   String to separate the items with.
    # @param ignore_nulls [Boolean]
    #   Ignore null values (default).
    #
    #   If set to `false`, null values will be propagated:
    #   if the sub-array contains any null values, the output is `nil`.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new([["x", "y"], ["a", "b"]], dtype: Polars::Array.new(Polars::String, 2))
    #   s.arr.join("-")
    #   # =>
    #   # shape: (2,)
    #   # Series: '' [str]
    #   # [
    #   #         "x-y"
    #   #         "a-b"
    #   # ]
    def join(separator, ignore_nulls: true)
      super
    end

    # Returns a column with a separate row for every array element.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new("a", [[1, 2, 3], [4, 5, 6]], dtype: Polars::Array.new(Polars::Int64, 3))
    #   s.arr.explode
    #   # =>
    #   # shape: (6,)
    #   # Series: 'a' [i64]
    #   # [
    #   #         1
    #   #         2
    #   #         3
    #   #         4
    #   #         5
    #   #         6
    #   # ]
    def explode
      super
    end

    # Check if sub-arrays contain the given item.
    #
    # @param item [Object]
    #   Item that will be checked for membership.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new(
    #     "a", [[3, 2, 1], [1, 2, 3], [4, 5, 6]], dtype: Polars::Array.new(Polars::Int32, 3)
    #   )
    #   s.arr.contains(1)
    #   # =>
    #   # shape: (3,)
    #   # Series: 'a' [bool]
    #   # [
    #   #         true
    #   #         true
    #   #         false
    #   # ]
    def contains(item)
      super
    end

    # Count how often the value produced by `element` occurs.
    #
    # @param element [Object]
    #   An expression that produces a single value.
    #
    # @return [Series]
    #
    # @example
    #   s = Polars::Series.new("a", [[1, 2, 3], [2, 2, 2]], dtype: Polars::Array.new(Polars::Int64, 3))
    #   s.arr.count_matches(2)
    #   # =>
    #   # shape: (2,)
    #   # Series: 'a' [u32]
    #   # [
    #   #         1
    #   #         3
    #   # ]
    def count_matches(element)
      super
    end
  end
end
@@ -0,0 +1,98 @@
module Polars
  # @private
  #
  # Reads a CSV source in batches through the native `RbBatchedCsv` reader and
  # wraps every returned batch with `Utils.wrap_df`.
  class BatchedCsvReader
    # `_reader` holds the native reader built in {#initialize}; `new_columns`
    # holds the optional replacement column names applied in {#next_batches}.
    attr_accessor :_reader, :new_columns

    # Build the native batched reader.
    #
    # @param file [Object]
    #   CSV source. It is converted with `Utils.normalise_filepath` only when
    #   `Utils.pathlike?(file)` is true; otherwise a `nil` path is handed to
    #   the native reader.
    # @param dtypes [Hash, Array, nil]
    #   A Hash has each value converted with `Utils.rb_type_to_dtype` and is
    #   passed on as `[name, dtype]` pairs; an Array is passed on unchanged.
    # @param new_columns [Array, nil]
    #   When set, applied to every batch through `Utils._update_columns`.
    #
    # The remaining keyword options are forwarded to `RbBatchedCsv.new`:
    # `null_values` after `Utils._process_null_values`, `columns` after
    # `Utils.handle_projection_columns`, and `row_count_name` /
    # `row_count_offset` combined by `Utils._prepare_row_count_args`.
    #
    # @raise [ArgumentError] if `dtypes` is neither `nil`, a Hash nor an Array.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil,
      truncate_ragged_lines: false
    )
      # `path` stays nil for sources that are not path-like.
      path = Utils.normalise_filepath(file) if Utils.pathlike?(file)

      dtype_list, dtype_slice = split_dtypes(dtypes)

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)

      # The native constructor takes its arguments positionally, so the order
      # below must not change.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        Utils._prepare_row_count_args(row_count_name, row_count_offset),
        sample_size,
        eol_char,
        truncate_ragged_lines
      )
      self.new_columns = new_columns
    end

    # Fetch up to `n` batches from the native reader.
    #
    # @param n [Integer]
    #   Number of batches requested; passed straight to the native reader.
    #
    # @return [Array, nil]
    #   The wrapped batches (renamed with `new_columns` when that is set), or
    #   `nil` when the native reader returns `nil`.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      batches.map do |rbdf|
        df = Utils.wrap_df(rbdf)
        new_columns ? Utils._update_columns(df, new_columns) : df
      end
    end

    private

    # Split `dtypes` into the `[dtype_list, dtype_slice]` pair expected by the
    # native reader: a Hash fills the first slot, an Array the second.
    def split_dtypes(dtypes)
      case dtypes
      when nil
        [nil, nil]
      when Hash
        [dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }, nil]
      when ::Array
        [nil, dtypes]
      else
        raise ArgumentError, "dtypes must be an Array or Hash, got #{dtypes.class}"
      end
    end
  end
end
@@ -0,0 +1,77 @@
module Polars
  # Namespace for binary related expressions.
  #
  # Holds the native expression handle (`_rbexpr`) of the expression it was
  # created from; each method calls the matching `binary_*` method on that
  # handle and wraps the result with `Utils.wrap_expr`.
  class BinaryExpr
    # @private
    attr_accessor :_rbexpr

    # @private
    def initialize(expr)
      self._rbexpr = expr._rbexpr
    end

    # Check if binaries in Series contain a binary substring.
    #
    # @param lit [String]
    #   The binary substring to look for
    #
    # @return [Expr]
    def contains(lit)
      Utils.wrap_expr(_rbexpr.binary_contains(lit))
    end

    # Check if values end with a binary substring.
    #
    # @param sub [String]
    #   Suffix substring.
    #
    # @return [Expr]
    def ends_with(sub)
      Utils.wrap_expr(_rbexpr.binary_ends_with(sub))
    end

    # Check if values start with a binary substring.
    #
    # @param sub [String]
    #   Prefix substring.
    #
    # @return [Expr]
    def starts_with(sub)
      Utils.wrap_expr(_rbexpr.binary_starts_with(sub))
    end

    # Decode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    # @param strict [Boolean]
    #   Raise an error if the underlying value cannot be decoded,
    #   otherwise mask out with a null value.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError] if `encoding` is neither "hex" nor "base64".
    def decode(encoding, strict: true)
      case encoding
      when "hex"
        Utils.wrap_expr(_rbexpr.binary_hex_decode(strict))
      when "base64"
        Utils.wrap_expr(_rbexpr.binary_base64_decode(strict))
      else
        # Single braces: Ruby has no `{{` escape, so doubled braces would be
        # printed literally. `inspect` keeps nil and symbols visible.
        raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
      end
    end

    # Encode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    #
    # @return [Expr]
    #
    # @raise [ArgumentError] if `encoding` is neither "hex" nor "base64".
    def encode(encoding)
      case encoding
      when "hex"
        Utils.wrap_expr(_rbexpr.binary_hex_encode)
      when "base64"
        Utils.wrap_expr(_rbexpr.binary_base64_encode)
      else
        # Same message format as #decode.
        raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
      end
    end
  end
end
@@ -0,0 +1,66 @@
module Polars
  # Series.bin namespace.
  #
  # Every public method below only calls `super`; the actual work is presumably
  # performed by the dispatch logic mixed in from ExprDispatch (not visible
  # here), keyed on the "bin" accessor set below — TODO confirm.
  class BinaryNameSpace
    include ExprDispatch

    # Accessor name for this namespace (the `_accessor=` writer is presumably
    # provided by ExprDispatch — confirm).
    self._accessor = "bin"

    # @private
    #
    # Stores the underlying `_s` handle of the given series.
    def initialize(series)
      self._s = series._s
    end

    # Check if binaries in Series contain a binary substring.
    #
    # @param lit [String]
    #   The binary substring to look for
    #
    # @return [Series]
    def contains(lit)
      super
    end

    # Check if values end with a binary substring.
    #
    # @param sub [String]
    #   Suffix substring.
    #
    # @return [Series]
    def ends_with(sub)
      super
    end

    # Check if values start with a binary substring.
    #
    # @param sub [String]
    #   Prefix substring.
    #
    # @return [Series]
    def starts_with(sub)
      super
    end

    # Decode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    # @param strict [Boolean]
    #   Raise an error if the underlying value cannot be decoded,
    #   otherwise mask out with a null value.
    #
    # @return [Series]
    def decode(encoding, strict: true)
      super
    end

    # Encode a value using the provided encoding.
    #
    # @param encoding ["hex", "base64"]
    #   The encoding to use.
    #
    # @return [Series]
    def encode(encoding)
      super
    end
  end
end