polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
data/lib/polars/io/csv.rb CHANGED
@@ -16,38 +16,55 @@ module Polars
16
16
  # Rename columns right after parsing the CSV file. If the given
17
17
  # list is shorter than the width of the DataFrame the remaining
18
18
  # columns will have their original name.
19
- # @param sep [String]
20
- # Single byte character to use as delimiter in the file.
21
- # @param comment_char [String]
22
- # Single byte character that indicates the start of a comment line,
23
- # for instance `#`.
19
+ # @param separator [String]
20
+ # Single byte character to use as separator in the file.
21
+ # @param comment_prefix [String]
22
+ # A string used to indicate the start of a comment line. Comment lines are skipped
23
+ # during parsing. Common examples of comment prefixes are `#` and `//`.
24
24
  # @param quote_char [String]
25
25
  # Single byte character used for csv quoting.
26
26
  # Set to nil to turn off special handling and escaping of quotes.
27
27
  # @param skip_rows [Integer]
28
28
  # Start reading after `skip_rows` lines.
29
- # @param dtypes [Object]
30
- # Overwrite dtypes during inference.
29
+ # @param skip_lines [Integer]
30
+ # Start reading after `skip_lines` lines. The header will be parsed at this
31
+ # offset. Note that CSV escaping will not be respected when skipping lines.
32
+ # If you want to skip valid CSV rows, use `skip_rows`.
33
+ # @param schema [Object]
34
+ # Provide the schema. This means that polars doesn't do schema inference.
35
+ # This argument expects the complete schema, whereas `schema_overrides` can be
36
+ # used to partially overwrite a schema. Note that the order of the columns in
37
+ # the provided `schema` must match the order of the columns in the CSV being read.
38
+ # @param schema_overrides [Object]
39
+ # Overwrite dtypes for specific or all columns during schema inference.
31
40
  # @param null_values [Object]
32
41
  # Values to interpret as null values. You can provide a:
33
42
  #
34
43
  # - `String`: All values equal to this string will be null.
35
44
  # - `Array`: All values equal to any string in this array will be null.
36
45
  # - `Hash`: A hash that maps column name to a null value string.
46
+ # @param missing_utf8_is_empty_string [Boolean]
47
+ # By default a missing value is considered to be null; if you would prefer missing
48
+ # utf8 values to be treated as the empty string you can set this param true.
37
49
  # @param ignore_errors [Boolean]
38
50
  # Try to keep reading lines if some lines yield errors.
39
51
  # First try `infer_schema_length: 0` to read all columns as
40
52
  # `:str` to check which values might cause an issue.
41
- # @param parse_dates [Boolean]
53
+ # @param try_parse_dates [Boolean]
42
54
  # Try to automatically parse dates. If this does not succeed,
43
55
  # the column remains of data type `:str`.
44
56
  # @param n_threads [Integer]
45
57
  # Number of threads to use in csv parsing.
46
58
  # Defaults to the number of physical CPUs of your system.
59
+ # @param infer_schema [Boolean]
60
+ # When `true`, the schema is inferred from the data using the first
61
+ # `infer_schema_length` rows.
62
+ # When `false`, the schema is not inferred and will be `Polars::String` if not
63
+ # specified in `schema` or `schema_overrides`.
47
64
  # @param infer_schema_length [Integer]
48
- # Maximum number of lines to read to infer schema.
49
- # If set to 0, all columns will be read as `:utf8`.
50
- # If set to `nil`, a full table scan will be done (slow).
65
+ # The maximum number of rows to scan for schema inference.
66
+ # If set to `nil`, the full data may be scanned *(this is slow)*.
67
+ # Set `infer_schema: false` to read all columns as `Polars::String`.
51
68
  # @param batch_size [Integer]
52
69
  # Number of lines to read into the buffer at once.
53
70
  # Modify this to change performance.
@@ -70,15 +87,22 @@ module Polars
70
87
  # particular storage connection.
71
88
  # @param skip_rows_after_header [Integer]
72
89
  # Skip this number of rows when the header is parsed.
73
- # @param row_count_name [String]
90
+ # @param row_index_name [String]
74
91
  # If not nil, this will insert a row count column with the given name into
75
92
  # the DataFrame.
76
- # @param row_count_offset [Integer]
93
+ # @param row_index_offset [Integer]
77
94
  # Offset to start the row index column (only used if the name is set).
78
95
  # @param eol_char [String]
79
96
  # Single byte end of line character.
97
+ # @param raise_if_empty [Boolean]
98
+ # When there is no data in the source, `NoDataError` is raised. If this parameter
99
+ # is set to false, an empty DataFrame (with no columns) is returned instead.
80
100
  # @param truncate_ragged_lines [Boolean]
81
101
  # Truncate lines that are longer than the schema.
102
+ # @param decimal_comma [Boolean]
103
+ # Parse floats using a comma as the decimal separator instead of a period.
104
+ # @param glob [Boolean]
105
+ # Expand path given via globbing rules.
82
106
  #
83
107
  # @return [DataFrame]
84
108
  #
@@ -92,30 +116,36 @@ module Polars
92
116
  has_header: true,
93
117
  columns: nil,
94
118
  new_columns: nil,
95
- sep: ",",
96
- comment_char: nil,
119
+ separator: ",",
120
+ comment_prefix: nil,
97
121
  quote_char: '"',
98
122
  skip_rows: 0,
99
- dtypes: nil,
123
+ skip_lines: 0,
124
+ schema: nil,
125
+ schema_overrides: nil,
100
126
  null_values: nil,
127
+ missing_utf8_is_empty_string: false,
101
128
  ignore_errors: false,
102
- parse_dates: false,
129
+ try_parse_dates: false,
103
130
  n_threads: nil,
131
+ infer_schema: true,
104
132
  infer_schema_length: N_INFER_DEFAULT,
105
133
  batch_size: 8192,
106
134
  n_rows: nil,
107
135
  encoding: "utf8",
108
136
  low_memory: false,
109
- rechunk: true,
137
+ rechunk: false,
110
138
  storage_options: nil,
111
139
  skip_rows_after_header: 0,
112
- row_count_name: nil,
113
- row_count_offset: 0,
140
+ row_index_name: nil,
141
+ row_index_offset: 0,
114
142
  eol_char: "\n",
115
- truncate_ragged_lines: false
143
+ raise_if_empty: true,
144
+ truncate_ragged_lines: false,
145
+ decimal_comma: false,
146
+ glob: true
116
147
  )
117
- Utils._check_arg_is_1byte("sep", sep, false)
118
- Utils._check_arg_is_1byte("comment_char", comment_char, false)
148
+ Utils._check_arg_is_1byte("separator", separator, false)
119
149
  Utils._check_arg_is_1byte("quote_char", quote_char, true)
120
150
  Utils._check_arg_is_1byte("eol_char", eol_char, false)
121
151
 
@@ -131,8 +161,8 @@ module Polars
131
161
  end
132
162
  end
133
163
 
134
- if projection || new_columns
135
- raise Todo
164
+ if !infer_schema
165
+ infer_schema_length = 0
136
166
  end
137
167
 
138
168
  df = nil
@@ -141,14 +171,17 @@ module Polars
141
171
  data,
142
172
  has_header: has_header,
143
173
  columns: columns || projection,
144
- sep: sep,
145
- comment_char: comment_char,
174
+ separator: separator,
175
+ comment_prefix: comment_prefix,
146
176
  quote_char: quote_char,
147
177
  skip_rows: skip_rows,
148
- dtypes: dtypes,
178
+ skip_lines: skip_lines,
179
+ schema_overrides: schema_overrides,
180
+ schema: schema,
149
181
  null_values: null_values,
182
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
150
183
  ignore_errors: ignore_errors,
151
- parse_dates: parse_dates,
184
+ try_parse_dates: try_parse_dates,
152
185
  n_threads: n_threads,
153
186
  infer_schema_length: infer_schema_length,
154
187
  batch_size: batch_size,
@@ -157,10 +190,13 @@ module Polars
157
190
  low_memory: low_memory,
158
191
  rechunk: rechunk,
159
192
  skip_rows_after_header: skip_rows_after_header,
160
- row_count_name: row_count_name,
161
- row_count_offset: row_count_offset,
193
+ row_index_name: row_index_name,
194
+ row_index_offset: row_index_offset,
162
195
  eol_char: eol_char,
163
- truncate_ragged_lines: truncate_ragged_lines
196
+ raise_if_empty: raise_if_empty,
197
+ truncate_ragged_lines: truncate_ragged_lines,
198
+ decimal_comma: decimal_comma,
199
+ glob: glob
164
200
  )
165
201
  end
166
202
 
@@ -176,26 +212,27 @@ module Polars
176
212
  file,
177
213
  has_header: true,
178
214
  columns: nil,
179
- sep: ",",
180
- comment_char: nil,
215
+ separator: ",",
216
+ comment_prefix: nil,
181
217
  quote_char: '"',
182
218
  skip_rows: 0,
183
- dtypes: nil,
219
+ skip_lines: 0,
184
220
  schema: nil,
221
+ schema_overrides: nil,
185
222
  null_values: nil,
186
223
  missing_utf8_is_empty_string: false,
187
224
  ignore_errors: false,
188
- parse_dates: false,
225
+ try_parse_dates: false,
189
226
  n_threads: nil,
190
227
  infer_schema_length: N_INFER_DEFAULT,
191
228
  batch_size: 8192,
192
229
  n_rows: nil,
193
230
  encoding: "utf8",
194
231
  low_memory: false,
195
- rechunk: true,
232
+ rechunk: false,
196
233
  skip_rows_after_header: 0,
197
- row_count_name: nil,
198
- row_count_offset: 0,
234
+ row_index_name: nil,
235
+ row_index_offset: 0,
199
236
  eol_char: "\n",
200
237
  raise_if_empty: true,
201
238
  truncate_ragged_lines: false,
@@ -213,16 +250,16 @@ module Polars
213
250
 
214
251
  dtype_list = nil
215
252
  dtype_slice = nil
216
- if !dtypes.nil?
217
- if dtypes.is_a?(Hash)
253
+ if !schema_overrides.nil?
254
+ if schema_overrides.is_a?(Hash)
218
255
  dtype_list = []
219
- dtypes.each do |k, v|
220
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
256
+ schema_overrides.each do |k, v|
257
+ dtype_list << [k, Utils.parse_into_dtype(v)]
221
258
  end
222
- elsif dtypes.is_a?(::Array)
223
- dtype_slice = dtypes
259
+ elsif schema_overrides.is_a?(::Array)
260
+ dtype_slice = schema_overrides
224
261
  else
225
- raise ArgumentError, "dtype arg should be list or dict"
262
+ raise TypeError, "dtype arg should be array or hash"
226
263
  end
227
264
  end
228
265
 
@@ -242,11 +279,13 @@ module Polars
242
279
  scan = scan_csv(
243
280
  file,
244
281
  has_header: has_header,
245
- sep: sep,
246
- comment_char: comment_char,
282
+ separator: separator,
283
+ comment_prefix: comment_prefix,
247
284
  quote_char: quote_char,
248
285
  skip_rows: skip_rows,
249
- dtypes: dtypes_dict,
286
+ skip_lines: skip_lines,
287
+ schema: schema,
288
+ schema_overrides: dtypes_dict,
250
289
  null_values: null_values,
251
290
  missing_utf8_is_empty_string: missing_utf8_is_empty_string,
252
291
  ignore_errors: ignore_errors,
@@ -255,9 +294,10 @@ module Polars
255
294
  low_memory: low_memory,
256
295
  rechunk: rechunk,
257
296
  skip_rows_after_header: skip_rows_after_header,
258
- row_count_name: row_count_name,
259
- row_count_offset: row_count_offset,
297
+ row_index_name: row_index_name,
298
+ row_index_offset: row_index_offset,
260
299
  eol_char: eol_char,
300
+ raise_if_empty: raise_if_empty,
261
301
  truncate_ragged_lines: truncate_ragged_lines,
262
302
  decimal_comma: decimal_comma,
263
303
  glob: glob
@@ -282,8 +322,9 @@ module Polars
282
322
  ignore_errors,
283
323
  n_rows,
284
324
  skip_rows,
325
+ skip_lines,
285
326
  projection,
286
- sep,
327
+ separator,
287
328
  rechunk,
288
329
  columns,
289
330
  encoding,
@@ -292,13 +333,13 @@ module Polars
292
333
  dtype_list,
293
334
  dtype_slice,
294
335
  low_memory,
295
- comment_char,
336
+ comment_prefix,
296
337
  quote_char,
297
338
  processed_null_values,
298
339
  missing_utf8_is_empty_string,
299
- parse_dates,
340
+ try_parse_dates,
300
341
  skip_rows_after_header,
301
- Utils.parse_row_index_args(row_count_name, row_count_offset),
342
+ Utils.parse_row_index_args(row_index_name, row_index_offset),
302
343
  eol_char,
303
344
  raise_if_empty,
304
345
  truncate_ragged_lines,
@@ -319,7 +360,7 @@ module Polars
319
360
  # Path to a file or a file-like object.
320
361
  # @param has_header [Boolean]
321
362
  # Indicate if the first row of dataset is a header or not.
322
- # If set to False, column names will be autogenerated in the
363
+ # If set to false, column names will be autogenerated in the
323
364
  # following format: `column_x`, with `x` being an
324
365
  # enumeration over every column in the dataset starting at 1.
325
366
  # @param columns [Object]
@@ -329,17 +370,21 @@ module Polars
329
370
  # Rename columns right after parsing the CSV file. If the given
330
371
  # list is shorter than the width of the DataFrame the remaining
331
372
  # columns will have their original name.
332
- # @param sep [String]
333
- # Single byte character to use as delimiter in the file.
334
- # @param comment_char [String]
335
- # Single byte character that indicates the start of a comment line,
336
- # for instance `#`.
373
+ # @param separator [String]
374
+ # Single byte character to use as separator in the file.
375
+ # @param comment_prefix [String]
376
+ # A string used to indicate the start of a comment line. Comment lines are skipped
377
+ # during parsing. Common examples of comment prefixes are `#` and `//`.
337
378
  # @param quote_char [String]
338
379
  # Single byte character used for csv quoting, default = `"`.
339
380
  # Set to nil to turn off special handling and escaping of quotes.
340
381
  # @param skip_rows [Integer]
341
382
  # Start reading after `skip_rows` lines.
342
- # @param dtypes [Object]
383
+ # @param skip_lines [Integer]
384
+ # Start reading after `skip_lines` lines. The header will be parsed at this
385
+ # offset. Note that CSV escaping will not be respected when skipping lines.
386
+ # If you want to skip valid CSV rows, use `skip_rows`.
387
+ # @param schema_overrides [Object]
343
388
  # Overwrite dtypes during inference.
344
389
  # @param null_values [Object]
345
390
  # Values to interpret as null values. You can provide a:
@@ -354,7 +399,7 @@ module Polars
354
399
  # Try to keep reading lines if some lines yield errors.
355
400
  # First try `infer_schema_length: 0` to read all columns as
356
401
  # `:str` to check which values might cause an issue.
357
- # @param parse_dates [Boolean]
402
+ # @param try_parse_dates [Boolean]
358
403
  # Try to automatically parse dates. If this does not succeed,
359
404
  # the column remains of data type `:str`.
360
405
  # @param n_threads [Integer]
@@ -383,10 +428,10 @@ module Polars
383
428
  # aggregating the chunks into a single array.
384
429
  # @param skip_rows_after_header [Integer]
385
430
  # Skip this number of rows when the header is parsed.
386
- # @param row_count_name [String]
431
+ # @param row_index_name [String]
387
432
  # If not nil, this will insert a row count column with the given name into
388
433
  # the DataFrame.
389
- # @param row_count_offset [Integer]
434
+ # @param row_index_offset [Integer]
390
435
  # Offset to start the row index column (only used if the name is set).
391
436
  # @param eol_char [String]
392
437
  # Single byte end of line character.
@@ -402,7 +447,7 @@ module Polars
402
447
  #
403
448
  # @example
404
449
  # reader = Polars.read_csv_batched(
405
- # "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
450
+ # "./tpch/tables_scale_100/lineitem.tbl", separator: "|", try_parse_dates: true
406
451
  # )
407
452
  # reader.next_batches(5)
408
453
  def read_csv_batched(
@@ -410,25 +455,26 @@ module Polars
410
455
  has_header: true,
411
456
  columns: nil,
412
457
  new_columns: nil,
413
- sep: ",",
414
- comment_char: nil,
458
+ separator: ",",
459
+ comment_prefix: nil,
415
460
  quote_char: '"',
416
461
  skip_rows: 0,
417
- dtypes: nil,
462
+ skip_lines: 0,
463
+ schema_overrides: nil,
418
464
  null_values: nil,
419
465
  missing_utf8_is_empty_string: false,
420
466
  ignore_errors: false,
421
- parse_dates: false,
467
+ try_parse_dates: false,
422
468
  n_threads: nil,
423
469
  infer_schema_length: N_INFER_DEFAULT,
424
470
  batch_size: 50_000,
425
471
  n_rows: nil,
426
472
  encoding: "utf8",
427
473
  low_memory: false,
428
- rechunk: true,
474
+ rechunk: false,
429
475
  skip_rows_after_header: 0,
430
- row_count_name: nil,
431
- row_count_offset: 0,
476
+ row_index_name: nil,
477
+ row_index_offset: 0,
432
478
  eol_char: "\n",
433
479
  raise_if_empty: true,
434
480
  truncate_ragged_lines: false,
@@ -444,23 +490,20 @@ module Polars
444
490
  end
445
491
  end
446
492
 
447
- if projection || new_columns
448
- raise Todo
449
- end
450
-
451
493
  BatchedCsvReader.new(
452
494
  source,
453
495
  has_header: has_header,
454
496
  columns: columns || projection,
455
- sep: sep,
456
- comment_char: comment_char,
497
+ separator: separator,
498
+ comment_prefix: comment_prefix,
457
499
  quote_char: quote_char,
458
500
  skip_rows: skip_rows,
459
- dtypes: dtypes,
501
+ skip_lines: skip_lines,
502
+ schema_overrides: schema_overrides,
460
503
  null_values: null_values,
461
504
  missing_utf8_is_empty_string: missing_utf8_is_empty_string,
462
505
  ignore_errors: ignore_errors,
463
- parse_dates: parse_dates,
506
+ try_parse_dates: try_parse_dates,
464
507
  n_threads: n_threads,
465
508
  infer_schema_length: infer_schema_length,
466
509
  batch_size: batch_size,
@@ -469,8 +512,8 @@ module Polars
469
512
  low_memory: low_memory,
470
513
  rechunk: rechunk,
471
514
  skip_rows_after_header: skip_rows_after_header,
472
- row_count_name: row_count_name,
473
- row_count_offset: row_count_offset,
515
+ row_index_name: row_index_name,
516
+ row_index_offset: row_index_offset,
474
517
  eol_char: eol_char,
475
518
  new_columns: new_columns,
476
519
  raise_if_empty: raise_if_empty,
@@ -492,19 +535,28 @@ module Polars
492
535
  # If set to false, column names will be autogenerated in the
493
536
  # following format: `column_x`, with `x` being an
494
537
  # enumeration over every column in the dataset starting at 1.
495
- # @param sep [String]
496
- # Single byte character to use as delimiter in the file.
497
- # @param comment_char [String]
498
- # Single byte character that indicates the start of a comment line,
499
- # for instance `#`.
538
+ # @param separator [String]
539
+ # Single byte character to use as separator in the file.
540
+ # @param comment_prefix [String]
541
+ # A string used to indicate the start of a comment line. Comment lines are skipped
542
+ # during parsing. Common examples of comment prefixes are `#` and `//`.
500
543
  # @param quote_char [String]
501
544
  # Single byte character used for csv quoting.
502
545
  # Set to nil to turn off special handling and escaping of quotes.
503
546
  # @param skip_rows [Integer]
504
547
  # Start reading after `skip_rows` lines. The header will be parsed at this
505
548
  # offset.
506
- # @param dtypes [Object]
507
- # Overwrite dtypes during inference.
549
+ # @param skip_lines [Integer]
550
+ # Start reading after `skip_lines` lines. The header will be parsed at this
551
+ # offset. Note that CSV escaping will not be respected when skipping lines.
552
+ # If you want to skip valid CSV rows, use `skip_rows`.
553
+ # @param schema [Object]
554
+ # Provide the schema. This means that polars doesn't do schema inference.
555
+ # This argument expects the complete schema, whereas `schema_overrides` can be
556
+ # used to partially overwrite a schema. Note that the order of the columns in
557
+ # the provided `schema` must match the order of the columns in the CSV being read.
558
+ # @param schema_overrides [Object]
559
+ # Overwrite dtypes for specific or all columns during schema inference.
508
560
  # @param null_values [Object]
509
561
  # Values to interpret as null values. You can provide a:
510
562
  #
@@ -524,6 +576,11 @@ module Polars
524
576
  # Apply a function over the column names.
525
577
  # This can be used to update a schema just in time, thus before
526
578
  # scanning.
579
+ # @param infer_schema [Boolean]
580
+ # When `true`, the schema is inferred from the data using the first
581
+ # `infer_schema_length` rows.
582
+ # When `false`, the schema is not inferred and will be `Polars::String` if not
583
+ # specified in `schema` or `schema_overrides`.
527
584
  # @param infer_schema_length [Integer]
528
585
  # Maximum number of lines to read to infer schema.
529
586
  # If set to 0, all columns will be read as `:str`.
@@ -539,16 +596,20 @@ module Polars
539
596
  # Reallocate to contiguous memory when all chunks/ files are parsed.
540
597
  # @param skip_rows_after_header [Integer]
541
598
  # Skip this number of rows when the header is parsed.
542
- # @param row_count_name [String]
599
+ # @param row_index_name [String]
543
600
  # If not nil, this will insert a row count column with the given name into
544
601
  # the DataFrame.
545
- # @param row_count_offset [Integer]
602
+ # @param row_index_offset [Integer]
546
603
  # Offset to start the row index column (only used if the name is set).
547
- # @param parse_dates [Boolean]
604
+ # @param try_parse_dates [Boolean]
548
605
  # Try to automatically parse dates. If this does not succeed,
549
606
  # the column remains of data type `:str`.
550
607
  # @param eol_char [String]
551
608
  # Single byte end of line character.
609
+ # @param new_columns [Array]
610
+ # Provide an explicit list of string column names to use (for example, when
611
+ # scanning a headerless CSV file). If the given list is shorter than the width of
612
+ # the DataFrame the remaining columns will have their original name.
552
613
  # @param raise_if_empty [Boolean]
553
614
  # When there is no data in the source, `NoDataError` is raised. If this parameter
554
615
  # is set to false, an empty LazyFrame (with no columns) is returned instead.
@@ -558,52 +619,100 @@ module Polars
558
619
  # Parse floats using a comma as the decimal separator instead of a period.
559
620
  # @param glob [Boolean]
560
621
  # Expand path given via globbing rules.
622
+ # @param storage_options [Hash]
623
+ # Options that indicate how to connect to a cloud provider.
624
+ #
625
+ # The cloud providers currently supported are AWS, GCP, and Azure.
626
+ # See supported keys here:
627
+ #
628
+ # * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
629
+ # * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
630
+ # * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
631
+ # * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
632
+ # `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
633
+ #
634
+ # If `storage_options` is not provided, Polars will try to infer the information
635
+ # from environment variables.
636
+ # @param credential_provider [Object]
637
+ # Provide a function that can be called to provide cloud storage
638
+ # credentials. The function is expected to return a hash of
639
+ # credential keys along with an optional credential expiry time.
640
+ # @param retries [Integer]
641
+ # Number of retries if accessing a cloud instance fails.
642
+ # @param file_cache_ttl [Integer]
643
+ # Amount of time to keep downloaded cloud files since their last access time,
644
+ # in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
645
+ # (which defaults to 1 hour) if not given.
646
+ # @param include_file_paths [String]
647
+ # Include the path of the source file(s) as a column with this name.
561
648
  #
562
649
  # @return [LazyFrame]
563
650
  def scan_csv(
564
651
  source,
565
652
  has_header: true,
566
- sep: ",",
567
- comment_char: nil,
653
+ separator: ",",
654
+ comment_prefix: nil,
568
655
  quote_char: '"',
569
656
  skip_rows: 0,
570
- dtypes: nil,
657
+ skip_lines: 0,
658
+ schema: nil,
659
+ schema_overrides: nil,
571
660
  null_values: nil,
572
661
  missing_utf8_is_empty_string: false,
573
662
  ignore_errors: false,
574
663
  cache: true,
575
664
  with_column_names: nil,
665
+ infer_schema: true,
576
666
  infer_schema_length: N_INFER_DEFAULT,
577
667
  n_rows: nil,
578
668
  encoding: "utf8",
579
669
  low_memory: false,
580
- rechunk: true,
670
+ rechunk: false,
581
671
  skip_rows_after_header: 0,
582
- row_count_name: nil,
583
- row_count_offset: 0,
584
- parse_dates: false,
672
+ row_index_name: nil,
673
+ row_index_offset: 0,
674
+ try_parse_dates: false,
585
675
  eol_char: "\n",
676
+ new_columns: nil,
586
677
  raise_if_empty: true,
587
678
  truncate_ragged_lines: false,
588
679
  decimal_comma: false,
589
- glob: true
680
+ glob: true,
681
+ storage_options: nil,
682
+ credential_provider: "auto",
683
+ retries: 2,
684
+ file_cache_ttl: nil,
685
+ include_file_paths: nil
590
686
  )
591
- Utils._check_arg_is_1byte("sep", sep, false)
592
- Utils._check_arg_is_1byte("comment_char", comment_char, false)
687
+ if new_columns
688
+ raise Todo
689
+ end
690
+
691
+ Utils._check_arg_is_1byte("separator", separator, false)
593
692
  Utils._check_arg_is_1byte("quote_char", quote_char, true)
594
693
 
595
694
  if Utils.pathlike?(source)
596
695
  source = Utils.normalize_filepath(source)
597
696
  end
598
697
 
698
+ if !infer_schema
699
+ infer_schema_length = 0
700
+ end
701
+
702
+ credential_provider_builder = _init_credential_provider_builder(
703
+ credential_provider, source, storage_options, "scan_csv"
704
+ )
705
+
599
706
  _scan_csv_impl(
600
707
  source,
601
708
  has_header: has_header,
602
- sep: sep,
603
- comment_char: comment_char,
709
+ separator: separator,
710
+ comment_prefix: comment_prefix,
604
711
  quote_char: quote_char,
605
712
  skip_rows: skip_rows,
606
- dtypes: dtypes,
713
+ skip_lines: skip_lines,
714
+ schema_overrides: schema_overrides,
715
+ schema: schema,
607
716
  null_values: null_values,
608
717
  ignore_errors: ignore_errors,
609
718
  cache: cache,
@@ -614,11 +723,19 @@ module Polars
614
723
  rechunk: rechunk,
615
724
  skip_rows_after_header: skip_rows_after_header,
616
725
  encoding: encoding,
617
- row_count_name: row_count_name,
618
- row_count_offset: row_count_offset,
619
- parse_dates: parse_dates,
726
+ row_index_name: row_index_name,
727
+ row_index_offset: row_index_offset,
728
+ try_parse_dates: try_parse_dates,
620
729
  eol_char: eol_char,
621
- truncate_ragged_lines: truncate_ragged_lines
730
+ raise_if_empty: raise_if_empty,
731
+ truncate_ragged_lines: truncate_ragged_lines,
732
+ decimal_comma: decimal_comma,
733
+ glob: glob,
734
+ retries: retries,
735
+ storage_options: storage_options,
736
+ credential_provider: credential_provider_builder,
737
+ file_cache_ttl: file_cache_ttl,
738
+ include_file_paths: include_file_paths
622
739
  )
623
740
  end
624
741
 
@@ -626,12 +743,15 @@ module Polars
626
743
  def _scan_csv_impl(
627
744
  source,
628
745
  has_header: true,
629
- sep: ",",
630
- comment_char: nil,
746
+ separator: ",",
747
+ comment_prefix: nil,
631
748
  quote_char: '"',
632
749
  skip_rows: 0,
633
- dtypes: nil,
750
+ skip_lines: 0,
751
+ schema: nil,
752
+ schema_overrides: nil,
634
753
  null_values: nil,
754
+ missing_utf8_is_empty_string: false,
635
755
  ignore_errors: false,
636
756
  cache: true,
637
757
  with_column_names: nil,
@@ -639,19 +759,27 @@ module Polars
639
759
  n_rows: nil,
640
760
  encoding: "utf8",
641
761
  low_memory: false,
642
- rechunk: true,
762
+ rechunk: false,
643
763
  skip_rows_after_header: 0,
644
- row_count_name: nil,
645
- row_count_offset: 0,
646
- parse_dates: false,
764
+ row_index_name: nil,
765
+ row_index_offset: 0,
766
+ try_parse_dates: false,
647
767
  eol_char: "\n",
648
- truncate_ragged_lines: true
768
+ raise_if_empty: true,
769
+ truncate_ragged_lines: true,
770
+ decimal_comma: false,
771
+ glob: true,
772
+ storage_options: nil,
773
+ credential_provider: nil,
774
+ retries: 2,
775
+ file_cache_ttl: nil,
776
+ include_file_paths: nil
649
777
  )
650
778
  dtype_list = nil
651
- if !dtypes.nil?
779
+ if !schema_overrides.nil?
652
780
  dtype_list = []
653
- dtypes.each do |k, v|
654
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
781
+ schema_overrides.each do |k, v|
782
+ dtype_list << [k, Utils.parse_into_dtype(v)]
655
783
  end
656
784
  end
657
785
  processed_null_values = Utils._process_null_values(null_values)
@@ -666,27 +794,38 @@ module Polars
666
794
  rblf =
667
795
  RbLazyFrame.new_from_csv(
668
796
  source,
669
- sep,
797
+ sources,
798
+ separator,
670
799
  has_header,
671
800
  ignore_errors,
672
801
  skip_rows,
802
+ skip_lines,
673
803
  n_rows,
674
804
  cache,
675
805
  dtype_list,
676
806
  low_memory,
677
- comment_char,
807
+ comment_prefix,
678
808
  quote_char,
679
809
  processed_null_values,
810
+ missing_utf8_is_empty_string,
680
811
  infer_schema_length,
681
812
  with_column_names,
682
813
  rechunk,
683
814
  skip_rows_after_header,
684
815
  encoding,
685
- Utils.parse_row_index_args(row_count_name, row_count_offset),
686
- parse_dates,
816
+ Utils.parse_row_index_args(row_index_name, row_index_offset),
817
+ try_parse_dates,
687
818
  eol_char,
819
+ raise_if_empty,
688
820
  truncate_ragged_lines,
689
- sources
821
+ decimal_comma,
822
+ glob,
823
+ schema,
824
+ storage_options,
825
+ credential_provider,
826
+ retries,
827
+ file_cache_ttl,
828
+ include_file_paths
690
829
  )
691
830
  Utils.wrap_ldf(rblf)
692
831
  end