polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -110,7 +110,7 @@ module Polars
110
110
  # # b"\x00\x00\xff"
111
111
  # # ]
112
112
  #
113
- # @example Set `strict=False` to set invalid values to null instead of raising an error.
113
+ # @example Set `strict: false` to set invalid values to null instead of raising an error.
114
114
  # s = Polars::Series.new("colors", ["000000".b, "ffff00".b, "invalid_value".b])
115
115
  # s.bin.decode("hex", strict: false)
116
116
  # # =>
@@ -44,13 +44,13 @@ module Polars
44
44
  # `len_chars` (_O(n)_).
45
45
  #
46
46
  # @example
47
- # df = Polars::DataFrame.new(
48
- # {"a" => Polars::Series.new(["Café", "345", "東京", nil], dtype: Polars::Categorical)}
49
- # )
50
- # df.with_columns(
51
- # Polars.col("a").cat.len_bytes.alias("n_bytes"),
52
- # Polars.col("a").cat.len_chars.alias("n_chars")
53
- # )
47
+ # df = Polars::DataFrame.new(
48
+ # {"a" => Polars::Series.new(["Café", "345", "東京", nil], dtype: Polars::Categorical)}
49
+ # )
50
+ # df.with_columns(
51
+ # Polars.col("a").cat.len_bytes.alias("n_bytes"),
52
+ # Polars.col("a").cat.len_chars.alias("n_chars")
53
+ # )
54
54
  # # =>
55
55
  # # shape: (4, 3)
56
56
  # # ┌──────┬─────────┬─────────┐
data/lib/polars/config.rb CHANGED
@@ -122,7 +122,7 @@ module Polars
122
122
  self
123
123
  end
124
124
 
125
- # Use ASCII characters to display table outlines (set False to revert to UTF8).
125
+ # Use ASCII characters to display table outlines (set false to revert to UTF8).
126
126
  #
127
127
  # @return [Config]
128
128
  #
@@ -17,9 +17,14 @@ module Polars
17
17
  # If you supply an array of column names that does not match the names in the
18
18
  # underlying data, the names given here will overwrite them. The number
19
19
  # of names given in the schema should match the underlying data dimensions.
20
- # @param columns [Array]
21
- # Column labels to use for resulting DataFrame. If specified, overrides any
22
- # labels already present in the data. Must match data dimensions.
20
+ # @param schema_overrides [Hash]
21
+ # Support type specification or override of one or more columns; note that
22
+ # any dtypes inferred from the `schema` param will be overridden.
23
+ # @param strict [Boolean]
24
+ # Throw an error if any `data` value does not exactly match the given or inferred
25
+ # data type for that column. If set to `false`, values that do not match the data
26
+ # type are cast to that data type or, if casting is not possible, set to null
27
+ # instead.
23
28
  #
24
29
  # @return [DataFrame]
25
30
  #
@@ -36,24 +41,47 @@ module Polars
36
41
  # # │ 1 ┆ 3 │
37
42
  # # │ 2 ┆ 4 │
38
43
  # # └─────┴─────┘
39
- def from_hash(data, schema: nil, columns: nil)
44
+ def from_hash(data, schema: nil, schema_overrides: nil, strict: true)
40
45
  Utils.wrap_df(
41
- DataFrame.hash_to_rbdf(
46
+ Utils.hash_to_rbdf(
42
47
  data,
43
- schema: schema || columns
48
+ schema: schema,
49
+ schema_overrides: schema_overrides,
50
+ strict: strict
44
51
  )
45
52
  )
46
53
  end
47
54
 
48
55
  # Construct a DataFrame from an array of hashes. This operation clones data.
49
56
  #
50
- # @param hashes [Array]
51
- # Array with hashes mapping column name to value.
52
- # @param infer_schema_length [Integer]
53
- # How many hashes/rows to scan to determine the data types
54
- # if set to `nil` all rows are scanned. This will be slow.
57
+ # @param data [Array]
58
+ # Array with hashes mapping column name to value
55
59
  # @param schema [Object]
56
- # Schema that (partially) overwrites the inferred schema.
60
+ # The DataFrame schema may be declared in several ways:
61
+ #
62
+ # * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
63
+ # * As an array of column names; in this case types are automatically inferred.
64
+ # * As an array of (name,type) pairs; this is equivalent to the hash form.
65
+ #
66
+ # If an array of column names is supplied that does NOT match the names in the
67
+ # underlying data, the names given here will overwrite the actual fields in
68
+ # the order that they appear - however, in this case it is typically clearer
69
+ # to rename after loading the frame.
70
+ #
71
+ # If you want to drop some of the fields found in the input hashes, a
72
+ # *partial* schema can be declared, in which case omitted fields will not be
73
+ # loaded. Similarly, you can extend the loaded frame with empty columns by
74
+ # adding them to the schema.
75
+ # @param schema_overrides [Hash]
76
+ # Support override of inferred types for one or more columns.
77
+ # @param strict [Boolean]
78
+ # Throw an error if any `data` value does not exactly match the given or inferred
79
+ # data type for that column. If set to `false`, values that do not match the data
80
+ # type are cast to that data type or, if casting is not possible, set to null
81
+ # instead.
82
+ # @param infer_schema_length [Integer]
83
+ # The maximum number of rows to scan for schema inference.
84
+ # If set to `nil`, the full data may be scanned *(this is slow)*.
57
85
  #
58
86
  # @return [DataFrame]
59
87
  #
@@ -72,37 +100,164 @@ module Polars
72
100
  # # │ 3 ┆ 6 │
73
101
  # # └─────┴─────┘
74
102
  #
75
- # @example Overwrite first column name and dtype
76
- # Polars.from_hashes(data, schema: {"c" => :i32})
103
+ # @example Declaring a partial `schema` will drop the omitted columns.
104
+ # Polars.from_hashes(data, schema: {"a" => Polars::Int32})
105
+ # # =>
106
+ # # shape: (3, 1)
107
+ # # ┌─────┐
108
+ # # │ a │
109
+ # # │ --- │
110
+ # # │ i32 │
111
+ # # ╞═════╡
112
+ # # │ 1 │
113
+ # # │ 2 │
114
+ # # │ 3 │
115
+ # # └─────┘
116
+ def from_hashes(
117
+ data,
118
+ schema: nil,
119
+ schema_overrides: nil,
120
+ strict: true,
121
+ infer_schema_length: N_INFER_DEFAULT
122
+ )
123
+ if !data.any? && !(schema.any? || schema_overrides.any?)
124
+ msg = "no data, cannot infer schema"
125
+ raise NoDataError, msg
126
+ end
127
+
128
+ DataFrame.new(
129
+ data,
130
+ schema: schema,
131
+ schema_overrides: schema_overrides,
132
+ strict: strict,
133
+ infer_schema_length: infer_schema_length
134
+ )
135
+ end
136
+
137
+ # Construct a DataFrame from an array of arrays. This operation clones data.
138
+ #
139
+ # Note that this is slower than creating from columnar memory.
140
+ #
141
+ # @param data [Array]
142
+ # Two-dimensional data represented as an array of arrays.
143
+ # @param schema [Object]
144
+ # The DataFrame schema may be declared in several ways:
145
+ #
146
+ # * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
147
+ # * As an array of column names; in this case types are automatically inferred.
148
+ # * As an array of (name,type) pairs; this is equivalent to the hash form.
149
+ #
150
+ # If you supply an array of column names that does not match the names in the
151
+ # underlying data, the names given here will overwrite them. The number
152
+ # of names given in the schema should match the underlying data dimensions.
153
+ # @param schema_overrides [Hash]
154
+ # Support type specification or override of one or more columns; note that
155
+ # any dtypes inferred from the `schema` param will be overridden.
156
+ # @param strict [Boolean]
157
+ # Throw an error if any `data` value does not exactly match the given or inferred
158
+ # data type for that column. If set to `false`, values that do not match the data
159
+ # type are cast to that data type or, if casting is not possible, set to null
160
+ # instead.
161
+ # @param orient ['col', 'row']
162
+ # Whether to interpret two-dimensional data as columns or as rows. If nil,
163
+ # the orientation is inferred by matching the columns and data dimensions. If
164
+ # this does not yield conclusive results, column orientation is used.
165
+ # @param infer_schema_length [Integer]
166
+ # The maximum number of rows to scan for schema inference.
167
+ # If set to `nil`, the full data may be scanned *(this is slow)*.
168
+ #
169
+ # @return [DataFrame]
170
+ #
171
+ # @example
172
+ # data = [[1, 2, 3], [4, 5, 6]]
173
+ # Polars.from_records(data, schema: ["a", "b"])
77
174
  # # =>
78
175
  # # shape: (3, 2)
79
176
  # # ┌─────┬─────┐
80
- # # │ c ┆ b │
177
+ # # │ a ┆ b │
81
178
  # # │ --- ┆ --- │
82
- # # │ i32 ┆ i64 │
179
+ # # │ i64 ┆ i64 │
83
180
  # # ╞═════╪═════╡
84
181
  # # │ 1 ┆ 4 │
85
182
  # # │ 2 ┆ 5 │
86
183
  # # │ 3 ┆ 6 │
87
184
  # # └─────┴─────┘
88
- #
89
- # @example Let polars infer the dtypes but inform about a 3rd column
90
- # Polars.from_hashes(data, schema: {"a" => :unknown, "b" => :unknown, "c" => :i32})
91
- # # shape: (3, 3)
92
- # # ┌─────┬─────┬──────┐
93
- # # │ a ┆ b ┆ c │
94
- # # │ --- ┆ --- ┆ --- │
95
- # # │ i64 ┆ i64 ┆ i32 │
96
- # # ╞═════╪═════╪══════╡
97
- # # 1 ┆ 4 ┆ null │
98
- # # 2 ┆ 5 ┆ null
99
- # # 3 ┆ 6 ┆ null
100
- # # └─────┴─────┴──────┘
101
- # def from_hashes(hashes, infer_schema_length: 50, schema: nil)
102
- # DataFrame._from_hashes(hashes, infer_schema_length: infer_schema_length, schema: schema)
103
- # end
185
+ def from_records(
186
+ data,
187
+ schema: nil,
188
+ schema_overrides: nil,
189
+ strict: true,
190
+ orient: nil,
191
+ infer_schema_length: N_INFER_DEFAULT
192
+ )
193
+ if !data.is_a?(::Array)
194
+ msg = (
195
+ "expected data of type Array, got #{data.class.name.inspect}" +
196
+ "\n\nHint: Try passing your data to the DataFrame constructor instead," +
197
+ " e.g. `Polars::DataFrame.new(data)`."
198
+ )
199
+ raise TypeError, msg
200
+ end
201
+
202
+ Utils.wrap_df(
203
+ Utils.sequence_to_rbdf(
204
+ data,
205
+ schema: schema,
206
+ schema_overrides: schema_overrides,
207
+ strict: strict,
208
+ orient: orient,
209
+ infer_schema_length: infer_schema_length
210
+ )
211
+ )
212
+ end
104
213
 
105
- # def from_records
106
- # end
214
+ # Construct a DataFrame from a Numo::NArray. This operation clones data.
215
+ #
216
+ # Note that this is slower than creating from columnar memory.
217
+ #
218
+ # @param data [Numo::NArray]
219
+ # Two-dimensional data represented as a Numo::NArray.
220
+ # @param schema [Object]
221
+ # The DataFrame schema may be declared in several ways:
222
+ #
223
+ # * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
224
+ # * As an array of column names; in this case types are automatically inferred.
225
+ # * As an array of (name,type) pairs; this is equivalent to the hash form.
226
+ #
227
+ # If you supply an array of column names that does not match the names in the
228
+ # underlying data, the names given here will overwrite them. The number
229
+ # of names given in the schema should match the underlying data dimensions.
230
+ # @param schema_overrides [Hash]
231
+ # Support type specification or override of one or more columns; note that
232
+ # any dtypes inferred from the `schema` param will be overridden.
233
+ # @param orient ['col', 'row']
234
+ # Whether to interpret two-dimensional data as columns or as rows. If nil,
235
+ # the orientation is inferred by matching the columns and data dimensions. If
236
+ # this does not yield conclusive results, column orientation is used.
237
+ #
238
+ # @return [DataFrame]
239
+ #
240
+ # @example
241
+ # data = Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
242
+ # Polars.from_numo(data, schema: ["a", "b"], orient: "col")
243
+ # # =>
244
+ # # shape: (3, 2)
245
+ # # ┌─────┬─────┐
246
+ # # │ a ┆ b │
247
+ # # │ --- ┆ --- │
248
+ # # │ i64 ┆ i64 │
249
+ # # ╞═════╪═════╡
250
+ # # │ 1 ┆ 4 │
251
+ # # │ 2 ┆ 5 │
252
+ # # │ 3 ┆ 6 │
253
+ # # └─────┴─────┘
254
+ def from_numo(
255
+ data,
256
+ schema: nil,
257
+ schema_overrides: nil,
258
+ orient: nil
259
+ )
260
+ raise Todo
261
+ end
107
262
  end
108
263
  end