polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
data/lib/polars/io/database.rb CHANGED
@@ -85,6 +85,5 @@ module Polars
 
       DataFrame.new(data, schema_overrides: schema_overrides)
     end
-    alias_method :read_sql, :read_database
   end
 end
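
With the `read_sql` alias removed, only the canonical `read_database` name remains. A minimal migration sketch (the SQL string is illustrative; `read_database` assumes a configured database connection, e.g. via ActiveRecord):

    require "polars-df"

    # 0.23.0 also accepted Polars.read_sql(...) as an alias.
    # 0.24.0 removes the alias, so call the canonical method:
    df = Polars.read_database("SELECT * FROM users")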
data/lib/polars/io/delta.rb CHANGED
@@ -21,19 +21,23 @@ module Polars
       source,
       version: nil,
       columns: nil,
-      rechunk: false,
+      rechunk: nil,
       storage_options: nil,
       delta_table_options: nil
     )
-      dl_tbl =
-        _get_delta_lake_table(
+      df =
+        scan_delta(
           source,
           version: version,
           storage_options: storage_options,
-          delta_table_options: delta_table_options
+          delta_table_options: delta_table_options,
+          rechunk: rechunk
         )
 
-      dl_tbl.to_polars(columns: columns, rechunk: rechunk)
+      if !columns.nil?
+        df = df.select(columns)
+      end
+      df.collect
     end
 
     # Lazily read from a Delta lake table.
@@ -46,13 +50,17 @@ module Polars
     #   Extra options for the storage backends supported by `deltalake-rb`.
     # @param delta_table_options [Hash]
     #   Additional keyword arguments while reading a Delta lake Table.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
     #
     # @return [LazyFrame]
     def scan_delta(
       source,
       version: nil,
       storage_options: nil,
-      delta_table_options: nil
+      delta_table_options: nil,
+      rechunk: nil
     )
       dl_tbl =
         _get_delta_lake_table(
@@ -62,7 +70,7 @@ module Polars
           delta_table_options: delta_table_options
         )
 
-      dl_tbl.to_polars(eager: false)
+      dl_tbl.to_polars(eager: false, rechunk: rechunk || false)
     end
 
     private
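
As these hunks show, `read_delta` is now a thin wrapper over `scan_delta`: column selection becomes a `select` on the lazy frame, followed by `collect`. A hedged sketch of the two now-equivalent call paths (the table path and column names are placeholders; both calls require the `deltalake-rb` gem):

    require "polars-df"

    # Eager read with column projection and rechunking:
    df = Polars.read_delta("./delta_table", columns: ["a", "b"], rechunk: true)

    # ...which 0.24.0 implements roughly as:
    df = Polars.scan_delta("./delta_table", rechunk: true).select(["a", "b"]).collect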
data/lib/polars/io/ipc.rb CHANGED
@@ -15,10 +15,10 @@ module Polars
     #   Only uncompressed IPC files can be memory mapped.
     # @param storage_options [Hash]
     #   Extra options that make sense for a particular storage connection.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with give name into the
     #   DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
     # @param rechunk [Boolean]
     #   Make sure that all data is contiguous.
@@ -30,8 +30,8 @@ module Polars
       n_rows: nil,
       memory_map: true,
       storage_options: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       rechunk: true
     )
       storage_options ||= {}
@@ -40,8 +40,8 @@ module Polars
         data,
         columns: columns,
         n_rows: n_rows,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
+        row_index_name: row_index_name,
+        row_index_offset: row_index_offset,
         rechunk: rechunk,
         memory_map: memory_map
       )
@@ -53,8 +53,8 @@ module Polars
       file,
       columns: nil,
       n_rows: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       rechunk: true,
       memory_map: true
     )
@@ -76,7 +76,7 @@ module Polars
           columns,
           projection,
           n_rows,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
           memory_map
         )
       Utils.wrap_df(rbdf)
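
Across the IPC readers, `row_count_name`/`row_count_offset` are renamed to `row_index_name`/`row_index_offset` with unchanged semantics. A migration sketch ("data.arrow" is a placeholder path):

    require "polars-df"

    # 0.23.0:
    # df = Polars.read_ipc("data.arrow", row_count_name: "idx", row_count_offset: 1)

    # 0.24.0:
    df = Polars.read_ipc("data.arrow", row_index_name: "idx", row_index_offset: 1)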
@@ -182,15 +182,19 @@ module Polars
     #   Cache the result after reading.
     # @param rechunk [Boolean]
     #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with give name into the
     #   DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
     # @param glob [Boolean]
     #   Expand path given via globbing rules.
     # @param storage_options [Hash]
     #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
     # @param retries [Integer]
     #   Number of retries if accessing a cloud instance fails.
     # @param file_cache_ttl [Integer]
@@ -215,11 +219,12 @@ module Polars
       source,
       n_rows: nil,
       cache: true,
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0,
+      rechunk: false,
+      row_index_name: nil,
+      row_index_offset: 0,
       glob: true,
       storage_options: nil,
+      credential_provider: "auto",
       retries: 2,
       file_cache_ttl: nil,
       hive_partitioning: nil,
@@ -227,11 +232,12 @@ module Polars
       try_parse_hive_dates: true,
       include_file_paths: nil
     )
-      row_index_name = row_count_name
-      row_index_offset = row_count_offset
-
       sources = get_sources(source)
 
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, sources, storage_options, "scan_parquet"
+      )
+
       rblf =
         RbLazyFrame.new_from_ipc(
           sources,
@@ -246,6 +252,7 @@ module Polars
           rechunk: rechunk,
           cache: cache,
           storage_options: !storage_options.nil? ? storage_options.to_a : nil,
+          credential_provider: credential_provider_builder,
           retries: retries
         ),
         file_cache_ttl
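
Two changes in `scan_ipc` are easy to miss: the `rechunk` default flips from `true` to `false`, and a `credential_provider` keyword (default `"auto"`) is threaded through to the native reader. Per the new docs it is a callable returning cloud credential keys plus an optional expiry; the two-element return shape below is an assumption based on that wording, and the bucket and key values are placeholders:

    require "polars-df"

    provider = lambda do
      # Assumed shape: [credentials_hash, expiry_or_nil], per the doc wording above.
      [{"aws_access_key_id" => "...", "aws_secret_access_key" => "..."}, nil]
    end

    lf = Polars.scan_ipc(
      "s3://my-bucket/data/*.arrow",
      credential_provider: provider,
      rechunk: true # restore the 0.23.0 default if your code relied on it
    )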
data/lib/polars/io/ndjson.rb CHANGED
@@ -2,41 +2,106 @@ module Polars
   module IO
     # Read into a DataFrame from a newline delimited JSON file.
     #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
+    # @param source [String]
+    #   Path to a file.
     # @param schema [Object]
     #   The DataFrame schema may be declared in several ways:
     #
-    #   * As a hash of \\{name:type} pairs; if type is nil, it will be auto-inferred.
-    #   * As an array of column names; in this case types are automatically inferred.
-    #   * As an array of [name,type] pairs; this is equivalent to the hash form.
+    #   * As a dict of \\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As a list of column names; in this case types are automatically inferred.
+    #   * As a list of (name,type) pairs; this is equivalent to the hash form.
     #
-    #   If you supply an array of column names that does not match the names in the
+    #   If you supply a list of column names that does not match the names in the
     #   underlying data, the names given here will overwrite them. The number
     #   of names given in the schema should match the underlying data dimensions.
     # @param schema_overrides [Hash]
     #   Support type specification or override of one or more columns; note that
     #   any dtypes inferred from the schema param will be overridden.
+    # @param infer_schema_length [Integer]
+    #   Infer the schema length from the first `infer_schema_length` rows.
+    # @param batch_size [Integer]
+    #   Number of rows to read in each batch.
+    # @param n_rows [Integer]
+    #   Stop reading from JSON file after reading `n_rows`.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/ files are parsed.
+    # @param row_index_name [String]
+    #   If not nil, this will insert a row count column with give name into the
+    #   DataFrame.
+    # @param row_index_offset [Integer]
+    #   Offset to start the row_count column (only use if the name is set).
+    # @param ignore_errors [Boolean]
+    #   Return `Null` if parsing fails because of schema mismatches.
+    # @param storage_options [Hash]
+    #   Options that indicate how to connect to a cloud provider.
+    #
+    #   The cloud providers currently supported are AWS, GCP, and Azure.
+    #   See supported keys here:
+    #
+    #   * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
+    #   * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
+    #   * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
+    #   * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+    #     `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+    #
+    #   If `storage_options` is not provided, Polars will try to infer the information
+    #   from environment variables.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param file_cache_ttl [Integer]
+    #   Amount of time to keep downloaded cloud files since their last access time,
+    #   in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+    #   (which defaults to 1 hour) if not given.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [DataFrame]
     def read_ndjson(
       source,
       schema: nil,
       schema_overrides: nil,
-      ignore_errors: false
+      infer_schema_length: N_INFER_DEFAULT,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: false,
+      row_index_name: nil,
+      row_index_offset: 0,
+      ignore_errors: false,
+      storage_options: nil,
+      credential_provider: "auto",
+      retries: 2,
+      file_cache_ttl: nil,
+      include_file_paths: nil
     )
-      if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
-      end
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, source, storage_options, "read_ndjson"
+      )
 
-      rbdf =
-        RbDataFrame.read_ndjson(
-          source,
-          ignore_errors,
-          schema,
-          schema_overrides
-        )
-      Utils.wrap_df(rbdf)
+      scan_ndjson(
+        source,
+        schema: schema,
+        schema_overrides: schema_overrides,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        row_index_name: row_index_name,
+        row_index_offset: row_index_offset,
+        ignore_errors: ignore_errors,
+        include_file_paths: include_file_paths,
+        retries: retries,
+        storage_options: storage_options,
+        credential_provider: credential_provider_builder,
+        file_cache_ttl: file_cache_ttl,
+      ).collect
     end
 
     # Lazily read from a newline delimited JSON file.
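
`read_ndjson` is now implemented as `scan_ndjson(...).collect`, so the eager reader picks up the full lazy option set (row index column, cloud storage options, `n_rows`, and so on). A sketch using some of the new keywords ("logs.ndjson" is a placeholder path):

    require "polars-df"

    df = Polars.read_ndjson(
      "logs.ndjson",
      n_rows: 1000,
      ignore_errors: true,   # return null on schema mismatches instead of raising
      row_index_name: "idx", # renamed from row_count_name
      rechunk: true
    )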
@@ -46,6 +111,19 @@ module Polars
     #
     # @param source [String]
     #   Path to a file.
+    # @param schema [Object]
+    #   The DataFrame schema may be declared in several ways:
+    #
+    #   * As a dict of \\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As a list of column names; in this case types are automatically inferred.
+    #   * As a list of (name,type) pairs; this is equivalent to the hash form.
+    #
+    #   If you supply a list of column names that does not match the names in the
+    #   underlying data, the names given here will overwrite them. The number
+    #   of names given in the schema should match the underlying data dimensions.
+    # @param schema_overrides [Hash]
+    #   Support type specification or override of one or more columns; note that
+    #   any dtypes inferred from the schema param will be overridden.
     # @param infer_schema_length [Integer]
     #   Infer the schema length from the first `infer_schema_length` rows.
     # @param batch_size [Integer]
@@ -56,22 +134,58 @@ module Polars
     #   Reduce memory pressure at the expense of performance.
     # @param rechunk [Boolean]
     #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with give name into the
     #   DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
+    # @param ignore_errors [Boolean]
+    #   Return `Null` if parsing fails because of schema mismatches.
+    # @param storage_options [Hash]
+    #   Options that indicate how to connect to a cloud provider.
+    #
+    #   The cloud providers currently supported are AWS, GCP, and Azure.
+    #   See supported keys here:
+    #
+    #   * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
+    #   * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
+    #   * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
+    #   * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+    #     `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+    #
+    #   If `storage_options` is not provided, Polars will try to infer the information
+    #   from environment variables.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param file_cache_ttl [Integer]
+    #   Amount of time to keep downloaded cloud files since their last access time,
+    #   in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+    #   (which defaults to 1 hour) if not given.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_ndjson(
       source,
+      schema: nil,
+      schema_overrides: nil,
       infer_schema_length: N_INFER_DEFAULT,
      batch_size: 1024,
       n_rows: nil,
       low_memory: false,
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0
+      rechunk: false,
+      row_index_name: nil,
+      row_index_offset: 0,
+      ignore_errors: false,
+      storage_options: nil,
+      credential_provider: "auto",
+      retries: 2,
+      file_cache_ttl: nil,
+      include_file_paths: nil
     )
       sources = []
       if Utils.pathlike?(source)
@@ -86,16 +200,39 @@ module Polars
       source = nil
       end
 
+      if infer_schema_length == 0
+        msg = "'infer_schema_length' should be positive"
+        raise ArgumentError, msg
+      end
+
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, source, storage_options, "scan_ndjson"
+      )
+
+      if storage_options&.any?
+        storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
+      else
+        storage_options = nil
+      end
+
       rblf =
         RbLazyFrame.new_from_ndjson(
           source,
           sources,
           infer_schema_length,
+          schema,
+          schema_overrides,
           batch_size,
           n_rows,
           low_memory,
           rechunk,
-          Utils.parse_row_index_args(row_count_name, row_count_offset)
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
+          ignore_errors,
+          include_file_paths,
+          storage_options,
+          credential_provider_builder,
+          retries,
+          file_cache_ttl
         )
       Utils.wrap_ldf(rblf)
     end
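
Besides the renamed row-index keywords and the `rechunk` default flipping to `false`, `scan_ndjson` now accepts an explicit `schema` (skipping inference for the listed columns) and rejects `infer_schema_length: 0`. A hedged sketch; the path is a placeholder and the dtype classes follow the gem's usual `Polars::` names:

    require "polars-df"

    lf = Polars.scan_ndjson(
      "logs.ndjson",
      schema: {"level" => Polars::String, "msg" => Polars::String}
    )

    # Now raises ArgumentError ("'infer_schema_length' should be positive"):
    # Polars.scan_ndjson("logs.ndjson", infer_schema_length: 0)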