RubyGems - polars-df - Versions diffs - 0.23.0 → 0.24.0 - Mend

polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146)

checksums.yaml +4 -4
data/CHANGELOG.md +127 -1
data/Cargo.lock +72 -58
data/README.md +31 -27
data/ext/polars/Cargo.toml +15 -6
data/ext/polars/src/batched_csv.rs +35 -39
data/ext/polars/src/c_api/allocator.rs +7 -0
data/ext/polars/src/c_api/mod.rs +1 -0
data/ext/polars/src/catalog/unity.rs +123 -101
data/ext/polars/src/conversion/any_value.rs +13 -17
data/ext/polars/src/conversion/chunked_array.rs +5 -5
data/ext/polars/src/conversion/datetime.rs +3 -2
data/ext/polars/src/conversion/mod.rs +50 -45
data/ext/polars/src/dataframe/export.rs +13 -13
data/ext/polars/src/dataframe/general.rs +223 -223
data/ext/polars/src/dataframe/io.rs +27 -141
data/ext/polars/src/dataframe/mod.rs +13 -5
data/ext/polars/src/dataframe/serde.rs +1 -1
data/ext/polars/src/error.rs +44 -7
data/ext/polars/src/exceptions.rs +45 -12
data/ext/polars/src/expr/array.rs +12 -0
data/ext/polars/src/expr/datatype.rs +2 -2
data/ext/polars/src/expr/datetime.rs +4 -5
data/ext/polars/src/expr/general.rs +49 -13
data/ext/polars/src/expr/list.rs +4 -0
data/ext/polars/src/expr/meta.rs +8 -3
data/ext/polars/src/expr/mod.rs +22 -6
data/ext/polars/src/expr/name.rs +19 -8
data/ext/polars/src/expr/rolling.rs +50 -1
data/ext/polars/src/expr/string.rs +0 -1
data/ext/polars/src/expr/struct.rs +7 -2
data/ext/polars/src/file.rs +136 -103
data/ext/polars/src/functions/aggregation.rs +9 -8
data/ext/polars/src/functions/io.rs +81 -10
data/ext/polars/src/functions/lazy.rs +95 -21
data/ext/polars/src/functions/mod.rs +2 -0
data/ext/polars/src/functions/range.rs +19 -3
data/ext/polars/src/functions/strings.rs +6 -0
data/ext/polars/src/functions/utils.rs +6 -0
data/ext/polars/src/interop/arrow/mod.rs +50 -1
data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
data/ext/polars/src/lazyframe/exitable.rs +39 -0
data/ext/polars/src/lazyframe/general.rs +340 -236
data/ext/polars/src/lazyframe/mod.rs +46 -10
data/ext/polars/src/lazyframe/optflags.rs +5 -4
data/ext/polars/src/lazyframe/serde.rs +11 -3
data/ext/polars/src/lazyframe/sink.rs +10 -5
data/ext/polars/src/lazygroupby.rs +6 -7
data/ext/polars/src/lib.rs +141 -76
data/ext/polars/src/map/dataframe.rs +12 -12
data/ext/polars/src/map/lazy.rs +7 -5
data/ext/polars/src/map/mod.rs +15 -8
data/ext/polars/src/map/series.rs +3 -3
data/ext/polars/src/on_startup.rs +16 -8
data/ext/polars/src/prelude.rs +1 -0
data/ext/polars/src/rb_modules.rs +19 -49
data/ext/polars/src/series/aggregation.rs +79 -140
data/ext/polars/src/series/arithmetic.rs +16 -22
data/ext/polars/src/series/comparison.rs +101 -222
data/ext/polars/src/series/construction.rs +17 -18
data/ext/polars/src/series/export.rs +1 -1
data/ext/polars/src/series/general.rs +254 -289
data/ext/polars/src/series/import.rs +17 -0
data/ext/polars/src/series/map.rs +178 -160
data/ext/polars/src/series/mod.rs +28 -12
data/ext/polars/src/series/scatter.rs +12 -9
data/ext/polars/src/sql.rs +16 -9
data/ext/polars/src/testing/frame.rs +31 -0
data/ext/polars/src/testing/mod.rs +5 -0
data/ext/polars/src/testing/series.rs +31 -0
data/ext/polars/src/timeout.rs +105 -0
data/ext/polars/src/utils.rs +159 -1
data/lib/polars/array_expr.rb +81 -12
data/lib/polars/array_name_space.rb +74 -7
data/lib/polars/batched_csv_reader.rb +21 -21
data/lib/polars/binary_name_space.rb +1 -1
data/lib/polars/cat_expr.rb +7 -7
data/lib/polars/config.rb +1 -1
data/lib/polars/convert.rb +189 -34
data/lib/polars/data_frame.rb +1066 -831
data/lib/polars/data_frame_plot.rb +173 -0
data/lib/polars/data_type_group.rb +1 -0
data/lib/polars/data_types.rb +31 -12
data/lib/polars/date_time_expr.rb +51 -69
data/lib/polars/date_time_name_space.rb +80 -112
data/lib/polars/dynamic_group_by.rb +7 -7
data/lib/polars/exceptions.rb +50 -10
data/lib/polars/expr.rb +470 -517
data/lib/polars/functions/aggregation/horizontal.rb +0 -1
data/lib/polars/functions/aggregation/vertical.rb +2 -3
data/lib/polars/functions/as_datatype.rb +290 -8
data/lib/polars/functions/eager.rb +204 -10
data/lib/polars/functions/escape_regex.rb +21 -0
data/lib/polars/functions/lazy.rb +409 -169
data/lib/polars/functions/lit.rb +17 -1
data/lib/polars/functions/range/int_range.rb +74 -2
data/lib/polars/functions/range/linear_space.rb +77 -0
data/lib/polars/functions/range/time_range.rb +1 -1
data/lib/polars/functions/repeat.rb +3 -12
data/lib/polars/functions/whenthen.rb +2 -2
data/lib/polars/group_by.rb +72 -20
data/lib/polars/iceberg_dataset.rb +1 -6
data/lib/polars/in_process_query.rb +37 -0
data/lib/polars/io/cloud.rb +18 -0
data/lib/polars/io/csv.rb +265 -126
data/lib/polars/io/database.rb +0 -1
data/lib/polars/io/delta.rb +15 -7
data/lib/polars/io/ipc.rb +24 -17
data/lib/polars/io/ndjson.rb +161 -24
data/lib/polars/io/parquet.rb +101 -38
data/lib/polars/lazy_frame.rb +849 -558
data/lib/polars/lazy_group_by.rb +327 -2
data/lib/polars/list_expr.rb +94 -16
data/lib/polars/list_name_space.rb +88 -24
data/lib/polars/meta_expr.rb +42 -1
data/lib/polars/name_expr.rb +41 -4
data/lib/polars/query_opt_flags.rb +198 -2
data/lib/polars/rolling_group_by.rb +3 -3
data/lib/polars/schema.rb +21 -3
data/lib/polars/selector.rb +37 -2
data/lib/polars/selectors.rb +45 -9
data/lib/polars/series.rb +1156 -728
data/lib/polars/series_plot.rb +72 -0
data/lib/polars/slice.rb +1 -1
data/lib/polars/sql_context.rb +11 -4
data/lib/polars/string_expr.rb +59 -68
data/lib/polars/string_name_space.rb +51 -87
data/lib/polars/struct_expr.rb +36 -18
data/lib/polars/testing.rb +24 -273
data/lib/polars/utils/constants.rb +2 -0
data/lib/polars/utils/construction/data_frame.rb +410 -0
data/lib/polars/utils/construction/series.rb +364 -0
data/lib/polars/utils/construction/utils.rb +9 -0
data/lib/polars/utils/deprecation.rb +11 -0
data/lib/polars/utils/serde.rb +8 -3
data/lib/polars/utils/unstable.rb +19 -0
data/lib/polars/utils/various.rb +59 -0
data/lib/polars/utils.rb +46 -47
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +47 -1
metadata +25 -6
data/ext/polars/src/allocator.rs +0 -13
data/lib/polars/plot.rb +0 -109

data/lib/polars/io/csv.rb CHANGED

@@ -16,38 +16,55 @@ module Polars
     #   Rename columns right after parsing the CSV file. If the given
     #   list is shorter than the width of the DataFrame the remaining
     #   columns will have their original name.
-    # @param sep [String]
-    #   Single byte character to use as delimiter in the file.
-    # @param comment_char [String]
-    #   Single byte character that indicates the start of a comment line,
-    #   for instance `#`.
+    # @param separator [String]
+    #   Single byte character to use as separator in the file.
+    # @param comment_prefix [String]
+    #   A string used to indicate the start of a comment line. Comment lines are skipped
+    #   during parsing. Common examples of comment prefixes are `#` and `//`.
     # @param quote_char [String]
     #   Single byte character used for csv quoting.
     #   Set to nil to turn off special handling and escaping of quotes.
     # @param skip_rows [Integer]
     #   Start reading after `skip_rows` lines.
-    # @param dtypes [Object]
-    #   Overwrite dtypes during inference.
+    # @param skip_lines [Integer]
+    #   Start reading after `skip_lines` lines. The header will be parsed at this
+    #   offset. Note that CSV escaping will not be respected when skipping lines.
+    #   If you want to skip valid CSV rows, use `skip_rows`.
+    # @param schema [Object]
+    #   Provide the schema. This means that polars doesn't do schema inference.
+    #   This argument expects the complete schema, whereas `schema_overrides` can be
+    #   used to partially overwrite a schema. Note that the order of the columns in
+    #   the provided `schema` must match the order of the columns in the CSV being read.
+    # @param schema_overrides [Object]
+    #   Overwrite dtypes for specific or all columns during schema inference.
     # @param null_values [Object]
     #   Values to interpret as null values. You can provide a:
     #
     #   - `String`: All values equal to this string will be null.
     #   - `Array`: All values equal to any string in this array will be null.
     #   - `Hash`: A hash that maps column name to a null value string.
+    # @param missing_utf8_is_empty_string [Boolean]
+    #   By default a missing value is considered to be null; if you would prefer missing
+    #   utf8 values to be treated as the empty string you can set this param true.
     # @param ignore_errors [Boolean]
     #   Try to keep reading lines if some lines yield errors.
     #   First try `infer_schema_length: 0` to read all columns as
     #   `:str` to check which values might cause an issue.
-    # @param parse_dates [Boolean]
+    # @param try_parse_dates [Boolean]
     #   Try to automatically parse dates. If this does not succeed,
     #   the column remains of data type `:str`.
     # @param n_threads [Integer]
     #   Number of threads to use in csv parsing.
     #   Defaults to the number of physical cpu's of your system.
+    # @param infer_schema [Boolean]
+    #   When `true`, the schema is inferred from the data using the first
+    #   `infer_schema_length` rows.
+    #   When `false`, the schema is not inferred and will be `Polars::String` if not
+    #   specified in `schema` or `schema_overrides`.
     # @param infer_schema_length [Integer]
-    #   Maximum number of lines to read to infer schema.
-    #   If set to 0, all columns will be read as `:utf8`.
-    #   If set to `nil`, a full table scan will be done (slow).
+    #   The maximum number of rows to scan for schema inference.
+    #   If set to `nil`, the full data may be scanned *(this is slow)*.
+    #   Set `infer_schema: false` to read all columns as `Polars::String`.
     # @param batch_size [Integer]
     #   Number of lines to read into the buffer at once.
     #   Modify this to change performance.
@@ -70,15 +87,22 @@ module Polars
     #   particular storage connection.
     # @param skip_rows_after_header [Integer]
     #   Skip this number of rows when the header is parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with the given name into
     #   the DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only used if the name is set).
     # @param eol_char [String]
     #   Single byte end of line character.
+    # @param raise_if_empty [Boolean]
+    #   When there is no data in the source, `NoDataError` is raised. If this parameter
+    #   is set to false, an empty DataFrame (with no columns) is returned instead.
     # @param truncate_ragged_lines [Boolean]
     #   Truncate lines that are longer than the schema.
+    # @param decimal_comma [Boolean]
+    #   Parse floats using a comma as the decimal separator instead of a period.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
     #
     # @return [DataFrame]
     #
@@ -92,30 +116,36 @@ module Polars
       has_header: true,
       columns: nil,
       new_columns: nil,
-      sep: ",",
-      comment_char: nil,
+      separator: ",",
+      comment_prefix: nil,
       quote_char: '"',
       skip_rows: 0,
-      dtypes: nil,
+      skip_lines: 0,
+      schema: nil,
+      schema_overrides: nil,
       null_values: nil,
+      missing_utf8_is_empty_string: false,
       ignore_errors: false,
-      parse_dates: false,
+      try_parse_dates: false,
       n_threads: nil,
+      infer_schema: true,
       infer_schema_length: N_INFER_DEFAULT,
       batch_size: 8192,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
-      rechunk: true,
+      rechunk: false,
       storage_options: nil,
       skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       eol_char: "\n",
-      truncate_ragged_lines: false
+      raise_if_empty: true,
+      truncate_ragged_lines: false,
+      decimal_comma: false,
+      glob: true
     )
-      Utils._check_arg_is_1byte("sep", sep, false)
-      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      Utils._check_arg_is_1byte("separator", separator, false)
       Utils._check_arg_is_1byte("quote_char", quote_char, true)
       Utils._check_arg_is_1byte("eol_char", eol_char, false)
@@ -131,8 +161,8 @@ module Polars
         end
       end
-      if projection || new_columns
-        raise Todo
+      if !infer_schema
+        infer_schema_length = 0
       end
       df = nil
@@ -141,14 +171,17 @@ module Polars
           data,
           has_header: has_header,
           columns: columns || projection,
-          sep: sep,
-          comment_char: comment_char,
+          separator: separator,
+          comment_prefix: comment_prefix,
           quote_char: quote_char,
           skip_rows: skip_rows,
-          dtypes: dtypes,
+          skip_lines: skip_lines,
+          schema_overrides: schema_overrides,
+          schema: schema,
           null_values: null_values,
+          missing_utf8_is_empty_string: missing_utf8_is_empty_string,
           ignore_errors: ignore_errors,
-          parse_dates: parse_dates,
+          try_parse_dates: try_parse_dates,
           n_threads: n_threads,
           infer_schema_length: infer_schema_length,
           batch_size: batch_size,
@@ -157,10 +190,13 @@ module Polars
           low_memory: low_memory,
           rechunk: rechunk,
           skip_rows_after_header: skip_rows_after_header,
-          row_count_name: row_count_name,
-          row_count_offset: row_count_offset,
+          row_index_name: row_index_name,
+          row_index_offset: row_index_offset,
           eol_char: eol_char,
-          truncate_ragged_lines: truncate_ragged_lines
+          raise_if_empty: raise_if_empty,
+          truncate_ragged_lines: truncate_ragged_lines,
+          decimal_comma: decimal_comma,
+          glob: glob
         )
       end
@@ -176,26 +212,27 @@ module Polars
       file,
       has_header: true,
       columns: nil,
-      sep: ",",
-      comment_char: nil,
+      separator: ",",
+      comment_prefix: nil,
       quote_char: '"',
       skip_rows: 0,
-      dtypes: nil,
+      skip_lines: 0,
       schema: nil,
+      schema_overrides: nil,
       null_values: nil,
       missing_utf8_is_empty_string: false,
       ignore_errors: false,
-      parse_dates: false,
+      try_parse_dates: false,
       n_threads: nil,
       infer_schema_length: N_INFER_DEFAULT,
       batch_size: 8192,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
-      rechunk: true,
+      rechunk: false,
       skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       eol_char: "\n",
       raise_if_empty: true,
       truncate_ragged_lines: false,
@@ -213,16 +250,16 @@ module Polars
       dtype_list = nil
       dtype_slice = nil
-      if !dtypes.nil?
-        if dtypes.is_a?(Hash)
+      if !schema_overrides.nil?
+        if schema_overrides.is_a?(Hash)
           dtype_list = []
-          dtypes.each do |k, v|
-            dtype_list << [k, Utils.rb_type_to_dtype(v)]
+          schema_overrides.each do |k, v|
+            dtype_list << [k, Utils.parse_into_dtype(v)]
           end
-        elsif dtypes.is_a?(::Array)
-          dtype_slice = dtypes
+        elsif schema_overrides.is_a?(::Array)
+          dtype_slice = schema_overrides
         else
-          raise ArgumentError, "dtype arg should be list or dict"
+          raise TypeError, "dtype arg should be array or hash"
         end
       end
@@ -242,11 +279,13 @@ module Polars
         scan = scan_csv(
           file,
           has_header: has_header,
-          sep: sep,
-          comment_char: comment_char,
+          separator: separator,
+          comment_prefix: comment_prefix,
           quote_char: quote_char,
           skip_rows: skip_rows,
-          dtypes: dtypes_dict,
+          skip_lines: skip_lines,
+          schema: schema,
+          schema_overrides: dtypes_dict,
           null_values: null_values,
           missing_utf8_is_empty_string: missing_utf8_is_empty_string,
           ignore_errors: ignore_errors,
@@ -255,9 +294,10 @@ module Polars
           low_memory: low_memory,
           rechunk: rechunk,
           skip_rows_after_header: skip_rows_after_header,
-          row_count_name: row_count_name,
-          row_count_offset: row_count_offset,
+          row_index_name: row_index_name,
+          row_index_offset: row_index_offset,
           eol_char: eol_char,
+          raise_if_empty: raise_if_empty,
           truncate_ragged_lines: truncate_ragged_lines,
           decimal_comma: decimal_comma,
           glob: glob
@@ -282,8 +322,9 @@ module Polars
           ignore_errors,
           n_rows,
           skip_rows,
+          skip_lines,
           projection,
-          sep,
+          separator,
           rechunk,
           columns,
           encoding,
@@ -292,13 +333,13 @@ module Polars
           dtype_list,
           dtype_slice,
           low_memory,
-          comment_char,
+          comment_prefix,
           quote_char,
           processed_null_values,
           missing_utf8_is_empty_string,
-          parse_dates,
+          try_parse_dates,
           skip_rows_after_header,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
           eol_char,
           raise_if_empty,
           truncate_ragged_lines,
@@ -319,7 +360,7 @@ module Polars
     #   Path to a file or a file-like object.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
-    #   If set to False, column names will be autogenerated in the
+    #   If set to false, column names will be autogenerated in the
     #   following format: `column_x`, with `x` being an
     #   enumeration over every column in the dataset starting at 1.
     # @param columns [Object]
@@ -329,17 +370,21 @@ module Polars
     #   Rename columns right after parsing the CSV file. If the given
     #   list is shorter than the width of the DataFrame the remaining
     #   columns will have their original name.
-    # @param sep [String]
-    #   Single byte character to use as delimiter in the file.
-    # @param comment_char [String]
-    #   Single byte character that indicates the start of a comment line,
-    #   for instance `#`.
+    # @param separator [String]
+    #   Single byte character to use as separator in the file.
+    # @param comment_prefix [String]
+    #   A string used to indicate the start of a comment line. Comment lines are skipped
+    #   during parsing. Common examples of comment prefixes are `#` and `//`.
     # @param quote_char [String]
     #   Single byte character used for csv quoting, default = `"`.
     #   Set to nil to turn off special handling and escaping of quotes.
     # @param skip_rows [Integer]
     #   Start reading after `skip_rows` lines.
-    # @param dtypes [Object]
+    # @param skip_lines [Integer]
+    #   Start reading after `skip_lines` lines. The header will be parsed at this
+    #   offset. Note that CSV escaping will not be respected when skipping lines.
+    #   If you want to skip valid CSV rows, use `skip_rows`.
+    # @param schema_overrides [Object]
     #   Overwrite dtypes during inference.
     # @param null_values [Object]
     #   Values to interpret as null values. You can provide a:
@@ -354,7 +399,7 @@ module Polars
     #   Try to keep reading lines if some lines yield errors.
     #   First try `infer_schema_length: 0` to read all columns as
     #   `:str` to check which values might cause an issue.
-    # @param parse_dates [Boolean]
+    # @param try_parse_dates [Boolean]
     #   Try to automatically parse dates. If this does not succeed,
     #   the column remains of data type `:str`.
     # @param n_threads [Integer]
@@ -383,10 +428,10 @@ module Polars
     #   aggregating the chunks into a single array.
     # @param skip_rows_after_header [Integer]
     #   Skip this number of rows when the header is parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with the given name into
     #   the DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only used if the name is set).
     # @param eol_char [String]
     #   Single byte end of line character.
@@ -402,7 +447,7 @@ module Polars
     #
     # @example
     #   reader = Polars.read_csv_batched(
-    #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+    #     "./tpch/tables_scale_100/lineitem.tbl", separator: "|", try_parse_dates: true
     #   )
     #   reader.next_batches(5)
     def read_csv_batched(
@@ -410,25 +455,26 @@ module Polars
       has_header: true,
       columns: nil,
       new_columns: nil,
-      sep: ",",
-      comment_char: nil,
+      separator: ",",
+      comment_prefix: nil,
       quote_char: '"',
       skip_rows: 0,
-      dtypes: nil,
+      skip_lines: 0,
+      schema_overrides: nil,
       null_values: nil,
       missing_utf8_is_empty_string: false,
       ignore_errors: false,
-      parse_dates: false,
+      try_parse_dates: false,
       n_threads: nil,
       infer_schema_length: N_INFER_DEFAULT,
       batch_size: 50_000,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
-      rechunk: true,
+      rechunk: false,
       skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       eol_char: "\n",
       raise_if_empty: true,
       truncate_ragged_lines: false,
@@ -444,23 +490,20 @@ module Polars
         end
       end
-      if projection || new_columns
-        raise Todo
-      end
       BatchedCsvReader.new(
         source,
         has_header: has_header,
         columns: columns || projection,
-        sep: sep,
-        comment_char: comment_char,
+        separator: separator,
+        comment_prefix: comment_prefix,
         quote_char: quote_char,
         skip_rows: skip_rows,
-        dtypes: dtypes,
+        skip_lines: skip_lines,
+        schema_overrides: schema_overrides,
         null_values: null_values,
         missing_utf8_is_empty_string: missing_utf8_is_empty_string,
         ignore_errors: ignore_errors,
-        parse_dates: parse_dates,
+        try_parse_dates: try_parse_dates,
         n_threads: n_threads,
         infer_schema_length: infer_schema_length,
         batch_size: batch_size,
@@ -469,8 +512,8 @@ module Polars
         low_memory: low_memory,
         rechunk: rechunk,
         skip_rows_after_header: skip_rows_after_header,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
+        row_index_name: row_index_name,
+        row_index_offset: row_index_offset,
         eol_char: eol_char,
         new_columns: new_columns,
         raise_if_empty: raise_if_empty,
@@ -492,19 +535,28 @@ module Polars
     #   If set to false, column names will be autogenerated in the
     #   following format: `column_x`, with `x` being an
     #   enumeration over every column in the dataset starting at 1.
-    # @param sep [String]
-    #   Single byte character to use as delimiter in the file.
-    # @param comment_char [String]
-    #   Single byte character that indicates the start of a comment line,
-    #   for instance `#`.
+    # @param separator [String]
+    #   Single byte character to use as separator in the file.
+    # @param comment_prefix [String]
+    #   A string used to indicate the start of a comment line. Comment lines are skipped
+    #   during parsing. Common examples of comment prefixes are `#` and `//`.
     # @param quote_char [String]
     #   Single byte character used for csv quoting.
     #   Set to nil to turn off special handling and escaping of quotes.
     # @param skip_rows [Integer]
     #   Start reading after `skip_rows` lines. The header will be parsed at this
     #   offset.
-    # @param dtypes [Object]
-    #   Overwrite dtypes during inference.
+    # @param skip_lines [Integer]
+    #   Start reading after `skip_lines` lines. The header will be parsed at this
+    #   offset. Note that CSV escaping will not be respected when skipping lines.
+    #   If you want to skip valid CSV rows, use `skip_rows`.
+    # @param schema [Object]
+    #   Provide the schema. This means that polars doesn't do schema inference.
+    #   This argument expects the complete schema, whereas `schema_overrides` can be
+    #   used to partially overwrite a schema. Note that the order of the columns in
+    #   the provided `schema` must match the order of the columns in the CSV being read.
+    # @param schema_overrides [Object]
+    #   Overwrite dtypes for specific or all columns during schema inference.
     # @param null_values [Object]
     #   Values to interpret as null values. You can provide a:
     #
@@ -524,6 +576,11 @@ module Polars
     #   Apply a function over the column names.
     #   This can be used to update a schema just in time, thus before
     #   scanning.
+    # @param infer_schema [Boolean]
+    #   When `true`, the schema is inferred from the data using the first
+    #   `infer_schema_length` rows.
+    #   When `false`, the schema is not inferred and will be `Polars::String` if not
+    #   specified in `schema` or `schema_overrides`.
     # @param infer_schema_length [Integer]
     #   Maximum number of lines to read to infer schema.
     #   If set to 0, all columns will be read as `:str`.
@@ -539,16 +596,20 @@ module Polars
     #   Reallocate to contiguous memory when all chunks/ files are parsed.
     # @param skip_rows_after_header [Integer]
     #   Skip this number of rows when the header is parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with the given name into
     #   the DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only used if the name is set).
-    # @param parse_dates [Boolean]
+    # @param try_parse_dates [Boolean]
     #   Try to automatically parse dates. If this does not succeed,
     #   the column remains of data type `:str`.
     # @param eol_char [String]
     #   Single byte end of line character.
+    # @param new_columns [Array]
+    #   Provide an explicit list of string column names to use (for example, when
+    #   scanning a headerless CSV file). If the given list is shorter than the width of
+    #   the DataFrame the remaining columns will have their original name.
     # @param raise_if_empty [Boolean]
     #   When there is no data in the source, `NoDataError` is raised. If this parameter
     #   is set to false, an empty LazyFrame (with no columns) is returned instead.
@@ -558,52 +619,100 @@ module Polars
     #   Parse floats using a comma as the decimal separator instead of a period.
     # @param glob [Boolean]
     #   Expand path given via globbing rules.
+    # @param storage_options [Hash]
+    #   Options that indicate how to connect to a cloud provider.
+    #
+    #   The cloud providers currently supported are AWS, GCP, and Azure.
+    #   See supported keys here:
+    #
+    #   * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
+    #   * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
+    #   * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
+    #   * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+    #     `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+    #
+    #   If `storage_options` is not provided, Polars will try to infer the information
+    #   from environment variables.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param file_cache_ttl [Integer]
+    #   Amount of time to keep downloaded cloud files since their last access time,
+    #   in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+    #   (which defaults to 1 hour) if not given.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_csv(
       source,
       has_header: true,
-      sep: ",",
-      comment_char: nil,
+      separator: ",",
+      comment_prefix: nil,
       quote_char: '"',
       skip_rows: 0,
-      dtypes: nil,
+      skip_lines: 0,
+      schema: nil,
+      schema_overrides: nil,
       null_values: nil,
       missing_utf8_is_empty_string: false,
       ignore_errors: false,
       cache: true,
       with_column_names: nil,
+      infer_schema: true,
       infer_schema_length: N_INFER_DEFAULT,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
-      rechunk: true,
+      rechunk: false,
       skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
-      parse_dates: false,
+      row_index_name: nil,
+      row_index_offset: 0,
+      try_parse_dates: false,
       eol_char: "\n",
+      new_columns: nil,
       raise_if_empty: true,
       truncate_ragged_lines: false,
       decimal_comma: false,
-      glob: true
+      glob: true,
+      storage_options: nil,
+      credential_provider: "auto",
+      retries: 2,
+      file_cache_ttl: nil,
+      include_file_paths: nil
     )
-      Utils._check_arg_is_1byte("sep", sep, false)
-      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      if new_columns
+        raise Todo
+      end
+      Utils._check_arg_is_1byte("separator", separator, false)
       Utils._check_arg_is_1byte("quote_char", quote_char, true)
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
       end
+      if !infer_schema
+        infer_schema_length = 0
+      end
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, source, storage_options, "scan_csv"
+      )
       _scan_csv_impl(
         source,
         has_header: has_header,
-        sep: sep,
-        comment_char: comment_char,
+        separator: separator,
+        comment_prefix: comment_prefix,
         quote_char: quote_char,
         skip_rows: skip_rows,
-        dtypes: dtypes,
+        skip_lines: skip_lines,
+        schema_overrides: schema_overrides,
+        schema: schema,
         null_values: null_values,
         ignore_errors: ignore_errors,
         cache: cache,
@@ -614,11 +723,19 @@ module Polars
         rechunk: rechunk,
         skip_rows_after_header: skip_rows_after_header,
         encoding: encoding,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
-        parse_dates: parse_dates,
+        row_index_name: row_index_name,
+        row_index_offset: row_index_offset,
+        try_parse_dates: try_parse_dates,
         eol_char: eol_char,
-        truncate_ragged_lines: truncate_ragged_lines
+        raise_if_empty: raise_if_empty,
+        truncate_ragged_lines: truncate_ragged_lines,
+        decimal_comma: decimal_comma,
+        glob: glob,
+        retries: retries,
+        storage_options: storage_options,
+        credential_provider: credential_provider_builder,
+        file_cache_ttl: file_cache_ttl,
+        include_file_paths: include_file_paths
       )
     end
@@ -626,12 +743,15 @@ module Polars
     def _scan_csv_impl(
       source,
       has_header: true,
-      sep: ",",
-      comment_char: nil,
+      separator: ",",
+      comment_prefix: nil,
       quote_char: '"',
       skip_rows: 0,
-      dtypes: nil,
+      skip_lines: 0,
+      schema: nil,
+      schema_overrides: nil,
       null_values: nil,
+      missing_utf8_is_empty_string: false,
       ignore_errors: false,
       cache: true,
       with_column_names: nil,
@@ -639,19 +759,27 @@ module Polars
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
-      rechunk: true,
+      rechunk: false,
       skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
-      parse_dates: false,
+      row_index_name: nil,
+      row_index_offset: 0,
+      try_parse_dates: false,
       eol_char: "\n",
-      truncate_ragged_lines: true
+      raise_if_empty: true,
+      truncate_ragged_lines: true,
+      decimal_comma: false,
+      glob: true,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      file_cache_ttl: nil,
+      include_file_paths: nil
     )
       dtype_list = nil
-      if !dtypes.nil?
+      if !schema_overrides.nil?
         dtype_list = []
-        dtypes.each do |k, v|
-          dtype_list << [k, Utils.rb_type_to_dtype(v)]
+        schema_overrides.each do |k, v|
+          dtype_list << [k, Utils.parse_into_dtype(v)]
         end
       end
       processed_null_values = Utils._process_null_values(null_values)
@@ -666,27 +794,38 @@ module Polars
       rblf =
         RbLazyFrame.new_from_csv(
           source,
-          sep,
+          sources,
+          separator,
           has_header,
           ignore_errors,
           skip_rows,
+          skip_lines,
           n_rows,
           cache,
           dtype_list,
           low_memory,
-          comment_char,
+          comment_prefix,
           quote_char,
           processed_null_values,
+          missing_utf8_is_empty_string,
           infer_schema_length,
           with_column_names,
           rechunk,
           skip_rows_after_header,
           encoding,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
-          parse_dates,
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
+          try_parse_dates,
           eol_char,
+          raise_if_empty,
           truncate_ragged_lines,
-          sources
+          decimal_comma,
+          glob,
+          schema,
+          storage_options,
+          credential_provider,
+          retries,
+          file_cache_ttl,
+          include_file_paths
         )
       Utils.wrap_ldf(rblf)
     end