RubyGems - polars-df - Versions diffs - 0.23.0 → 0.24.0 - Mend

polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146)

checksums.yaml +4 -4
data/CHANGELOG.md +127 -1
data/Cargo.lock +72 -58
data/README.md +31 -27
data/ext/polars/Cargo.toml +15 -6
data/ext/polars/src/batched_csv.rs +35 -39
data/ext/polars/src/c_api/allocator.rs +7 -0
data/ext/polars/src/c_api/mod.rs +1 -0
data/ext/polars/src/catalog/unity.rs +123 -101
data/ext/polars/src/conversion/any_value.rs +13 -17
data/ext/polars/src/conversion/chunked_array.rs +5 -5
data/ext/polars/src/conversion/datetime.rs +3 -2
data/ext/polars/src/conversion/mod.rs +50 -45
data/ext/polars/src/dataframe/export.rs +13 -13
data/ext/polars/src/dataframe/general.rs +223 -223
data/ext/polars/src/dataframe/io.rs +27 -141
data/ext/polars/src/dataframe/mod.rs +13 -5
data/ext/polars/src/dataframe/serde.rs +1 -1
data/ext/polars/src/error.rs +44 -7
data/ext/polars/src/exceptions.rs +45 -12
data/ext/polars/src/expr/array.rs +12 -0
data/ext/polars/src/expr/datatype.rs +2 -2
data/ext/polars/src/expr/datetime.rs +4 -5
data/ext/polars/src/expr/general.rs +49 -13
data/ext/polars/src/expr/list.rs +4 -0
data/ext/polars/src/expr/meta.rs +8 -3
data/ext/polars/src/expr/mod.rs +22 -6
data/ext/polars/src/expr/name.rs +19 -8
data/ext/polars/src/expr/rolling.rs +50 -1
data/ext/polars/src/expr/string.rs +0 -1
data/ext/polars/src/expr/struct.rs +7 -2
data/ext/polars/src/file.rs +136 -103
data/ext/polars/src/functions/aggregation.rs +9 -8
data/ext/polars/src/functions/io.rs +81 -10
data/ext/polars/src/functions/lazy.rs +95 -21
data/ext/polars/src/functions/mod.rs +2 -0
data/ext/polars/src/functions/range.rs +19 -3
data/ext/polars/src/functions/strings.rs +6 -0
data/ext/polars/src/functions/utils.rs +6 -0
data/ext/polars/src/interop/arrow/mod.rs +50 -1
data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
data/ext/polars/src/lazyframe/exitable.rs +39 -0
data/ext/polars/src/lazyframe/general.rs +340 -236
data/ext/polars/src/lazyframe/mod.rs +46 -10
data/ext/polars/src/lazyframe/optflags.rs +5 -4
data/ext/polars/src/lazyframe/serde.rs +11 -3
data/ext/polars/src/lazyframe/sink.rs +10 -5
data/ext/polars/src/lazygroupby.rs +6 -7
data/ext/polars/src/lib.rs +141 -76
data/ext/polars/src/map/dataframe.rs +12 -12
data/ext/polars/src/map/lazy.rs +7 -5
data/ext/polars/src/map/mod.rs +15 -8
data/ext/polars/src/map/series.rs +3 -3
data/ext/polars/src/on_startup.rs +16 -8
data/ext/polars/src/prelude.rs +1 -0
data/ext/polars/src/rb_modules.rs +19 -49
data/ext/polars/src/series/aggregation.rs +79 -140
data/ext/polars/src/series/arithmetic.rs +16 -22
data/ext/polars/src/series/comparison.rs +101 -222
data/ext/polars/src/series/construction.rs +17 -18
data/ext/polars/src/series/export.rs +1 -1
data/ext/polars/src/series/general.rs +254 -289
data/ext/polars/src/series/import.rs +17 -0
data/ext/polars/src/series/map.rs +178 -160
data/ext/polars/src/series/mod.rs +28 -12
data/ext/polars/src/series/scatter.rs +12 -9
data/ext/polars/src/sql.rs +16 -9
data/ext/polars/src/testing/frame.rs +31 -0
data/ext/polars/src/testing/mod.rs +5 -0
data/ext/polars/src/testing/series.rs +31 -0
data/ext/polars/src/timeout.rs +105 -0
data/ext/polars/src/utils.rs +159 -1
data/lib/polars/array_expr.rb +81 -12
data/lib/polars/array_name_space.rb +74 -7
data/lib/polars/batched_csv_reader.rb +21 -21
data/lib/polars/binary_name_space.rb +1 -1
data/lib/polars/cat_expr.rb +7 -7
data/lib/polars/config.rb +1 -1
data/lib/polars/convert.rb +189 -34
data/lib/polars/data_frame.rb +1066 -831
data/lib/polars/data_frame_plot.rb +173 -0
data/lib/polars/data_type_group.rb +1 -0
data/lib/polars/data_types.rb +31 -12
data/lib/polars/date_time_expr.rb +51 -69
data/lib/polars/date_time_name_space.rb +80 -112
data/lib/polars/dynamic_group_by.rb +7 -7
data/lib/polars/exceptions.rb +50 -10
data/lib/polars/expr.rb +470 -517
data/lib/polars/functions/aggregation/horizontal.rb +0 -1
data/lib/polars/functions/aggregation/vertical.rb +2 -3
data/lib/polars/functions/as_datatype.rb +290 -8
data/lib/polars/functions/eager.rb +204 -10
data/lib/polars/functions/escape_regex.rb +21 -0
data/lib/polars/functions/lazy.rb +409 -169
data/lib/polars/functions/lit.rb +17 -1
data/lib/polars/functions/range/int_range.rb +74 -2
data/lib/polars/functions/range/linear_space.rb +77 -0
data/lib/polars/functions/range/time_range.rb +1 -1
data/lib/polars/functions/repeat.rb +3 -12
data/lib/polars/functions/whenthen.rb +2 -2
data/lib/polars/group_by.rb +72 -20
data/lib/polars/iceberg_dataset.rb +1 -6
data/lib/polars/in_process_query.rb +37 -0
data/lib/polars/io/cloud.rb +18 -0
data/lib/polars/io/csv.rb +265 -126
data/lib/polars/io/database.rb +0 -1
data/lib/polars/io/delta.rb +15 -7
data/lib/polars/io/ipc.rb +24 -17
data/lib/polars/io/ndjson.rb +161 -24
data/lib/polars/io/parquet.rb +101 -38
data/lib/polars/lazy_frame.rb +849 -558
data/lib/polars/lazy_group_by.rb +327 -2
data/lib/polars/list_expr.rb +94 -16
data/lib/polars/list_name_space.rb +88 -24
data/lib/polars/meta_expr.rb +42 -1
data/lib/polars/name_expr.rb +41 -4
data/lib/polars/query_opt_flags.rb +198 -2
data/lib/polars/rolling_group_by.rb +3 -3
data/lib/polars/schema.rb +21 -3
data/lib/polars/selector.rb +37 -2
data/lib/polars/selectors.rb +45 -9
data/lib/polars/series.rb +1156 -728
data/lib/polars/series_plot.rb +72 -0
data/lib/polars/slice.rb +1 -1
data/lib/polars/sql_context.rb +11 -4
data/lib/polars/string_expr.rb +59 -68
data/lib/polars/string_name_space.rb +51 -87
data/lib/polars/struct_expr.rb +36 -18
data/lib/polars/testing.rb +24 -273
data/lib/polars/utils/constants.rb +2 -0
data/lib/polars/utils/construction/data_frame.rb +410 -0
data/lib/polars/utils/construction/series.rb +364 -0
data/lib/polars/utils/construction/utils.rb +9 -0
data/lib/polars/utils/deprecation.rb +11 -0
data/lib/polars/utils/serde.rb +8 -3
data/lib/polars/utils/unstable.rb +19 -0
data/lib/polars/utils/various.rb +59 -0
data/lib/polars/utils.rb +46 -47
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +47 -1
metadata +25 -6
data/ext/polars/src/allocator.rs +0 -13
data/lib/polars/plot.rb +0 -109

data/lib/polars/io/database.rb CHANGED Viewed

@@ -85,6 +85,5 @@ module Polars
       DataFrame.new(data, schema_overrides: schema_overrides)
     end
-    alias_method :read_sql, :read_database
   end
 end

data/lib/polars/io/delta.rb CHANGED Viewed

@@ -21,19 +21,23 @@ module Polars
       source,
       version: nil,
       columns: nil,
-      rechunk: false,
+      rechunk: nil,
       storage_options: nil,
       delta_table_options: nil
     )
-      dl_tbl =
-        _get_delta_lake_table(
+      df =
+        scan_delta(
           source,
           version: version,
           storage_options: storage_options,
-          delta_table_options: delta_table_options
+          delta_table_options: delta_table_options,
+          rechunk: rechunk
         )
-      dl_tbl.to_polars(columns: columns, rechunk: rechunk)
+      if !columns.nil?
+        df = df.select(columns)
+      end
+      df.collect
     end
     # Lazily read from a Delta lake table.
@@ -46,13 +50,17 @@ module Polars
     #   Extra options for the storage backends supported by `deltalake-rb`.
     # @param delta_table_options [Hash]
     #   Additional keyword arguments while reading a Delta lake Table.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
     #
     # @return [LazyFrame]
     def scan_delta(
       source,
       version: nil,
       storage_options: nil,
-      delta_table_options: nil
+      delta_table_options: nil,
+      rechunk: nil
     )
       dl_tbl =
         _get_delta_lake_table(
@@ -62,7 +70,7 @@ module Polars
           delta_table_options: delta_table_options
         )
-      dl_tbl.to_polars(eager: false)
+      dl_tbl.to_polars(eager: false, rechunk: rechunk || false)
     end
     private

data/lib/polars/io/ipc.rb CHANGED Viewed

@@ -15,10 +15,10 @@ module Polars
     #   Only uncompressed IPC files can be memory mapped.
     # @param storage_options [Hash]
     #   Extra options that make sense for a particular storage connection.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with the given name into the
     #   DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
     # @param rechunk [Boolean]
     #   Make sure that all data is contiguous.
@@ -30,8 +30,8 @@ module Polars
       n_rows: nil,
       memory_map: true,
       storage_options: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       rechunk: true
     )
       storage_options ||= {}
@@ -40,8 +40,8 @@ module Polars
           data,
           columns: columns,
           n_rows: n_rows,
-          row_count_name: row_count_name,
-          row_count_offset: row_count_offset,
+          row_index_name: row_index_name,
+          row_index_offset: row_index_offset,
           rechunk: rechunk,
           memory_map: memory_map
         )
@@ -53,8 +53,8 @@ module Polars
       file,
       columns: nil,
       n_rows: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       rechunk: true,
       memory_map: true
     )
@@ -76,7 +76,7 @@ module Polars
           columns,
           projection,
           n_rows,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
           memory_map
         )
       Utils.wrap_df(rbdf)
@@ -182,15 +182,19 @@ module Polars
     #   Cache the result after reading.
     # @param rechunk [Boolean]
     #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with the given name into the
     #   DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
     # @param glob [Boolean]
     #   Expand path given via globbing rules.
     # @param storage_options [Hash]
     #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
     # @param retries [Integer]
     #   Number of retries if accessing a cloud instance fails.
     # @param file_cache_ttl [Integer]
@@ -215,11 +219,12 @@ module Polars
       source,
       n_rows: nil,
       cache: true,
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0,
+      rechunk: false,
+      row_index_name: nil,
+      row_index_offset: 0,
       glob: true,
       storage_options: nil,
+      credential_provider: "auto",
       retries: 2,
       file_cache_ttl: nil,
       hive_partitioning: nil,
@@ -227,11 +232,12 @@ module Polars
       try_parse_hive_dates: true,
       include_file_paths: nil
     )
-      row_index_name = row_count_name
-      row_index_offset = row_count_offset
       sources = get_sources(source)
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, sources, storage_options, "scan_parquet"
+      )
       rblf =
         RbLazyFrame.new_from_ipc(
           sources,
@@ -246,6 +252,7 @@ module Polars
             rechunk: rechunk,
             cache: cache,
             storage_options: !storage_options.nil? ? storage_options.to_a : nil,
+            credential_provider: credential_provider_builder,
             retries: retries
           ),
           file_cache_ttl

data/lib/polars/io/ndjson.rb CHANGED Viewed

@@ -2,41 +2,106 @@ module Polars
   module IO
     # Read into a DataFrame from a newline delimited JSON file.
     #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
+    # @param source [String]
+    #   Path to a file.
     # @param schema [Object]
     #   The DataFrame schema may be declared in several ways:
     #
-    #   * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
-    #   * As an array of column names; in this case types are automatically inferred.
-    #   * As an array of [name,type] pairs; this is equivalent to the hash form.
+    #   * As a dict of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As a list of column names; in this case types are automatically inferred.
+    #   * As a list of (name,type) pairs; this is equivalent to the hash form.
     #
-    #   If you supply an array of column names that does not match the names in the
+    #   If you supply a list of column names that does not match the names in the
     #   underlying data, the names given here will overwrite them. The number
     #   of names given in the schema should match the underlying data dimensions.
     # @param schema_overrides [Hash]
     #   Support type specification or override of one or more columns; note that
     #   any dtypes inferred from the schema param will be overridden.
+    # @param infer_schema_length [Integer]
+    #   Infer the schema length from the first `infer_schema_length` rows.
+    # @param batch_size [Integer]
+    #   Number of rows to read in each batch.
+    # @param n_rows [Integer]
+    #   Stop reading from JSON file after reading `n_rows`.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/ files are parsed.
+    # @param row_index_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_index_offset [Integer]
+    #   Offset to start the row_count column (only use if the name is set).
+    # @param ignore_errors [Boolean]
+    #   Return `Null` if parsing fails because of schema mismatches.
+    # @param storage_options [Hash]
+    #   Options that indicate how to connect to a cloud provider.
+    #
+    #   The cloud providers currently supported are AWS, GCP, and Azure.
+    #   See supported keys here:
+    #
+    #   * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
+    #   * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
+    #   * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
+    #   * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+    #     `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+    #
+    #   If `storage_options` is not provided, Polars will try to infer the information
+    #   from environment variables.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param file_cache_ttl [Integer]
+    #   Amount of time to keep downloaded cloud files since their last access time,
+    #   in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+    #   (which defaults to 1 hour) if not given.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [DataFrame]
     def read_ndjson(
       source,
       schema: nil,
       schema_overrides: nil,
-      ignore_errors: false
+      infer_schema_length: N_INFER_DEFAULT,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: false,
+      row_index_name: nil,
+      row_index_offset: 0,
+      ignore_errors: false,
+      storage_options: nil,
+      credential_provider: "auto",
+      retries: 2,
+      file_cache_ttl: nil,
+      include_file_paths: nil
     )
-      if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
-      end
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, source, storage_options, "read_ndjson"
+      )
-      rbdf =
-        RbDataFrame.read_ndjson(
-          source,
-          ignore_errors,
-          schema,
-          schema_overrides
-        )
-      Utils.wrap_df(rbdf)
+      scan_ndjson(
+        source,
+        schema: schema,
+        schema_overrides: schema_overrides,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        row_index_name: row_index_name,
+        row_index_offset: row_index_offset,
+        ignore_errors: ignore_errors,
+        include_file_paths: include_file_paths,
+        retries: retries,
+        storage_options: storage_options,
+        credential_provider: credential_provider_builder,
+        file_cache_ttl: file_cache_ttl,
+      ).collect
     end
     # Lazily read from a newline delimited JSON file.
@@ -46,6 +111,19 @@ module Polars
     #
     # @param source [String]
     #   Path to a file.
+    # @param schema [Object]
+    #   The DataFrame schema may be declared in several ways:
+    #
+    #   * As a dict of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As a list of column names; in this case types are automatically inferred.
+    #   * As a list of (name,type) pairs; this is equivalent to the hash form.
+    #
+    #   If you supply a list of column names that does not match the names in the
+    #   underlying data, the names given here will overwrite them. The number
+    #   of names given in the schema should match the underlying data dimensions.
+    # @param schema_overrides [Hash]
+    #   Support type specification or override of one or more columns; note that
+    #   any dtypes inferred from the schema param will be overridden.
     # @param infer_schema_length [Integer]
     #   Infer the schema length from the first `infer_schema_length` rows.
     # @param batch_size [Integer]
@@ -56,22 +134,58 @@ module Polars
     #   Reduce memory pressure at the expense of performance.
     # @param rechunk [Boolean]
     #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param row_count_name [String]
+    # @param row_index_name [String]
     #   If not nil, this will insert a row count column with the given name into the
     #   DataFrame.
-    # @param row_count_offset [Integer]
+    # @param row_index_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
+    # @param ignore_errors [Boolean]
+    #   Return `Null` if parsing fails because of schema mismatches.
+    # @param storage_options [Hash]
+    #   Options that indicate how to connect to a cloud provider.
+    #
+    #   The cloud providers currently supported are AWS, GCP, and Azure.
+    #   See supported keys here:
+    #
+    #   * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
+    #   * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
+    #   * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
+    #   * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
+    #     `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
+    #
+    #   If `storage_options` is not provided, Polars will try to infer the information
+    #   from environment variables.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a hash of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param file_cache_ttl [Integer]
+    #   Amount of time to keep downloaded cloud files since their last access time,
+    #   in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+    #   (which defaults to 1 hour) if not given.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_ndjson(
       source,
+      schema: nil,
+      schema_overrides: nil,
       infer_schema_length: N_INFER_DEFAULT,
       batch_size: 1024,
       n_rows: nil,
       low_memory: false,
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0
+      rechunk: false,
+      row_index_name: nil,
+      row_index_offset: 0,
+      ignore_errors: false,
+      storage_options: nil,
+      credential_provider: "auto",
+      retries: 2,
+      file_cache_ttl: nil,
+      include_file_paths: nil
     )
       sources = []
       if Utils.pathlike?(source)
@@ -86,16 +200,39 @@ module Polars
         source = nil
       end
+      if infer_schema_length == 0
+        msg = "'infer_schema_length' should be positive"
+        raise ArgumentError, msg
+      end
+      credential_provider_builder = _init_credential_provider_builder(
+        credential_provider, source, storage_options, "scan_ndjson"
+      )
+      if storage_options&.any?
+        storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
+      else
+        storage_options = nil
+      end
       rblf =
         RbLazyFrame.new_from_ndjson(
           source,
           sources,
           infer_schema_length,
+          schema,
+          schema_overrides,
           batch_size,
           n_rows,
           low_memory,
           rechunk,
-          Utils.parse_row_index_args(row_count_name, row_count_offset)
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
+          ignore_errors,
+          include_file_paths,
+          storage_options,
+          credential_provider_builder,
+          retries,
+          file_cache_ttl
         )
       Utils.wrap_ldf(rblf)
     end