polars-df 0.13.0-x86_64-linux-musl → 0.15.0-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE-THIRD-PARTY.txt +24801 -13447
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
data/lib/polars/io/parquet.rb
CHANGED
@@ -2,120 +2,108 @@ module Polars
  2   2 |   module IO
  3   3 |   # Read into a DataFrame from a parquet file.
  4   4 |   #
  5     | - # @param source [
      5 | + # @param source [Object]
  6   6 |   # Path to a file or a file-like object.
  7   7 |   # @param columns [Object]
  8   8 |   # Columns to select. Accepts a list of column indices (starting at zero) or a list
  9   9 |   # of column names.
 10  10 |   # @param n_rows [Integer]
 11  11 |   # Stop reading from parquet file after reading `n_rows`.
 12     | - # @param storage_options [Hash]
 13     | - # Extra options that make sense for a particular storage connection.
 14     | - # @param parallel ["auto", "columns", "row_groups", "none"]
 15     | - # This determines the direction of parallelism. 'auto' will try to determine the
 16     | - # optimal direction.
 17  12 |   # @param row_count_name [String]
 18  13 |   # If not nil, this will insert a row count column with give name into the
 19  14 |   # DataFrame.
 20  15 |   # @param row_count_offset [Integer]
 21  16 |   # Offset to start the row_count column (only use if the name is set).
 22     | - # @param
 23     | - #
     17 | + # @param parallel ["auto", "columns", "row_groups", "none"]
     18 | + # This determines the direction of parallelism. 'auto' will try to determine the
     19 | + # optimal direction.
 24  20 |   # @param use_statistics [Boolean]
 25  21 |   # Use statistics in the parquet to determine if pages
 26  22 |   # can be skipped from reading.
     23 | + # @param hive_partitioning [Boolean]
     24 | + # Infer statistics and schema from hive partitioned URL and use them
     25 | + # to prune reads.
     26 | + # @param glob [Boolean]
     27 | + # Expand path given via globbing rules.
     28 | + # @param schema [Object]
     29 | + # Specify the datatypes of the columns. The datatypes must match the
     30 | + # datatypes in the file(s). If there are extra columns that are not in the
     31 | + # file(s), consider also enabling `allow_missing_columns`.
     32 | + # @param hive_schema [Object]
     33 | + # The column names and data types of the columns by which the data is partitioned.
     34 | + # If set to `nil` (default), the schema of the Hive partitions is inferred.
     35 | + # @param try_parse_hive_dates [Boolean]
     36 | + # Whether to try parsing hive values as date/datetime types.
 27  37 |   # @param rechunk [Boolean]
 28     | - #
 29     | - #
     38 | + # In case of reading multiple files via a glob pattern rechunk the final DataFrame
     39 | + # into contiguous memory chunks.
     40 | + # @param low_memory [Boolean]
     41 | + # Reduce memory pressure at the expense of performance.
     42 | + # @param storage_options [Hash]
     43 | + # Extra options that make sense for a particular storage connection.
     44 | + # @param credential_provider [Object]
     45 | + # Provide a function that can be called to provide cloud storage
     46 | + # credentials. The function is expected to return a dictionary of
     47 | + # credential keys along with an optional credential expiry time.
     48 | + # @param retries [Integer]
     49 | + # Number of retries if accessing a cloud instance fails.
     50 | + # @param include_file_paths [String]
     51 | + # Include the path of the source file(s) as a column with this name.
 30  52 |   #
 31  53 |   # @return [DataFrame]
 32     | - #
 33     | - # @note
 34     | - # This operation defaults to a `rechunk` operation at the end, meaning that
 35     | - # all data will be stored continuously in memory.
 36     | - # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
 37     | - # an expensive operation.
 38  54 |   def read_parquet(
 39  55 |   source,
 40  56 |   columns: nil,
 41  57 |   n_rows: nil,
 42     | - storage_options: nil,
 43     | - parallel: "auto",
 44  58 |   row_count_name: nil,
 45  59 |   row_count_offset: 0,
 46     | -
     60 | + parallel: "auto",
 47  61 |   use_statistics: true,
 48     | -
     62 | + hive_partitioning: nil,
     63 | + glob: true,
     64 | + schema: nil,
     65 | + hive_schema: nil,
     66 | + try_parse_hive_dates: true,
     67 | + rechunk: false,
     68 | + low_memory: false,
     69 | + storage_options: nil,
     70 | + credential_provider: nil,
     71 | + retries: 2,
     72 | + include_file_paths: nil,
     73 | + allow_missing_columns: false
 49  74 |   )
 50     | -
 51     | -
 52     | -
 53     | - columns: columns,
     75 | + lf =
     76 | + scan_parquet(
     77 | + source,
 54  78 |   n_rows: n_rows,
 55     | - parallel: parallel,
 56  79 |   row_count_name: row_count_name,
 57  80 |   row_count_offset: row_count_offset,
 58     | -
     81 | + parallel: parallel,
 59  82 |   use_statistics: use_statistics,
 60     | -
     83 | + hive_partitioning: hive_partitioning,
     84 | + schema: schema,
     85 | + hive_schema: hive_schema,
     86 | + try_parse_hive_dates: try_parse_hive_dates,
     87 | + rechunk: rechunk,
     88 | + low_memory: low_memory,
     89 | + cache: false,
     90 | + storage_options: storage_options,
     91 | + credential_provider: credential_provider,
     92 | + retries: retries,
     93 | + glob: glob,
     94 | + include_file_paths: include_file_paths,
     95 | + allow_missing_columns: allow_missing_columns
 61  96 |   )
 62     | - end
 63     | - end
 64  97 |
 65     | -
 66     | -
 67     | -
 68     | - columns: nil,
 69     | - n_rows: nil,
 70     | - parallel: "auto",
 71     | - row_count_name: nil,
 72     | - row_count_offset: 0,
 73     | - low_memory: false,
 74     | - use_statistics: true,
 75     | - rechunk: true
 76     | - )
 77     | - if Utils.pathlike?(source)
 78     | - source = Utils.normalize_filepath(source)
 79     | - end
 80     | - if columns.is_a?(::String)
 81     | - columns = [columns]
 82     | - end
 83     | -
 84     | - if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
 85     | - scan =
 86     | - scan_parquet(
 87     | - source,
 88     | - n_rows: n_rows,
 89     | - rechunk: true,
 90     | - parallel: parallel,
 91     | - row_count_name: row_count_name,
 92     | - row_count_offset: row_count_offset,
 93     | - low_memory: low_memory
 94     | - )
 95     | -
 96     | - if columns.nil?
 97     | - return scan.collect
 98     | - elsif Utils.is_str_sequence(columns, allow_str: false)
 99     | - return scan.select(columns).collect
     98 | + if !columns.nil?
     99 | + if Utils.is_int_sequence(columns)
    100 | + lf = lf.select(F.nth(columns))
100 101 |   else
101     | -
    102 | + lf = lf.select(columns)
102 103 |   end
103 104 |   end
104 105 |
105     | -
106     | - rbdf =
107     | - RbDataFrame.read_parquet(
108     | - source,
109     | - columns,
110     | - projection,
111     | - n_rows,
112     | - parallel,
113     | - Utils.parse_row_index_args(row_count_name, row_count_offset),
114     | - low_memory,
115     | - use_statistics,
116     | - rechunk
117     | - )
118     | - Utils.wrap_df(rbdf)
    106 | + lf.collect
119 107 |   end
120 108 |
121 109 |   # Get a schema of the Parquet file without reading data.
@@ -137,46 +125,83 @@ module Polars
137 125 |   # This allows the query optimizer to push down predicates and projections to the scan
138 126 |   # level, thereby potentially reducing memory overhead.
139 127 |   #
140     | - # @param source [
141     | - # Path to a file.
    128 | + # @param source [Object]
    129 | + # Path to a file or a file-like object.
142 130 |   # @param n_rows [Integer]
143 131 |   # Stop reading from parquet file after reading `n_rows`.
144     | - # @param
145     | - #
    132 | + # @param row_count_name [String]
    133 | + # If not nil, this will insert a row count column with give name into the
    134 | + # DataFrame.
    135 | + # @param row_count_offset [Integer]
    136 | + # Offset to start the row_count column (only use if the name is set).
146 137 |   # @param parallel ["auto", "columns", "row_groups", "none"]
147 138 |   # This determines the direction of parallelism. 'auto' will try to determine the
148 139 |   # optimal direction.
    140 | + # @param use_statistics [Boolean]
    141 | + # Use statistics in the parquet to determine if pages
    142 | + # can be skipped from reading.
    143 | + # @param hive_partitioning [Boolean]
    144 | + # Infer statistics and schema from hive partitioned URL and use them
    145 | + # to prune reads.
    146 | + # @param glob [Boolean]
    147 | + # Expand path given via globbing rules.
    148 | + # @param schema [Object]
    149 | + # Specify the datatypes of the columns. The datatypes must match the
    150 | + # datatypes in the file(s). If there are extra columns that are not in the
    151 | + # file(s), consider also enabling `allow_missing_columns`.
    152 | + # @param hive_schema [Object]
    153 | + # The column names and data types of the columns by which the data is partitioned.
    154 | + # If set to `nil` (default), the schema of the Hive partitions is inferred.
    155 | + # @param try_parse_hive_dates [Boolean]
    156 | + # Whether to try parsing hive values as date/datetime types.
149 157 |   # @param rechunk [Boolean]
150 158 |   # In case of reading multiple files via a glob pattern rechunk the final DataFrame
151 159 |   # into contiguous memory chunks.
152     | - # @param row_count_name [String]
153     | - # If not nil, this will insert a row count column with give name into the
154     | - # DataFrame.
155     | - # @param row_count_offset [Integer]
156     | - # Offset to start the row_count column (only use if the name is set).
157     | - # @param storage_options [Hash]
158     | - # Extra options that make sense for a particular storage connection.
159 160 |   # @param low_memory [Boolean]
160 161 |   # Reduce memory pressure at the expense of performance.
    162 | + # @param cache [Boolean]
    163 | + # Cache the result after reading.
    164 | + # @param storage_options [Hash]
    165 | + # Extra options that make sense for a particular storage connection.
    166 | + # @param credential_provider [Object]
    167 | + # Provide a function that can be called to provide cloud storage
    168 | + # credentials. The function is expected to return a dictionary of
    169 | + # credential keys along with an optional credential expiry time.
    170 | + # @param retries [Integer]
    171 | + # Number of retries if accessing a cloud instance fails.
161 172 |   # @param include_file_paths [String]
162     | - #
    173 | + # Include the path of the source file(s) as a column with this name.
163 174 |   #
164 175 |   # @return [LazyFrame]
165 176 |   def scan_parquet(
166 177 |   source,
167 178 |   n_rows: nil,
168     | - cache: true,
169     | - parallel: "auto",
170     | - glob: true,
171     | - rechunk: true,
172 179 |   row_count_name: nil,
173 180 |   row_count_offset: 0,
174     | -
    181 | + parallel: "auto",
    182 | + use_statistics: true,
    183 | + hive_partitioning: nil,
    184 | + glob: true,
    185 | + schema: nil,
    186 | + hive_schema: nil,
    187 | + try_parse_hive_dates: true,
    188 | + rechunk: false,
175 189 |   low_memory: false,
176     | -
    190 | + cache: true,
    191 | + storage_options: nil,
    192 | + credential_provider: nil,
    193 | + retries: 2,
    194 | + include_file_paths: nil,
    195 | + allow_missing_columns: false
177 196 |   )
178 197 |   if Utils.pathlike?(source)
179     | - source = Utils.normalize_filepath(source)
    198 | + source = Utils.normalize_filepath(source, check_not_directory: false)
    199 | + elsif Utils.is_path_or_str_sequence(source)
    200 | + source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
    201 | + end
    202 | +
    203 | + if credential_provider
    204 | + raise Todo
180 205 |   end
181 206 |
182 207 |   _scan_parquet_impl(
@@ -185,47 +210,79 @@ module Polars
185 210 |   cache: cache,
186 211 |   parallel: parallel,
187 212 |   rechunk: rechunk,
188     | -
189     | -
    213 | + row_index_name: row_count_name,
    214 | + row_index_offset: row_count_offset,
190 215 |   storage_options: storage_options,
    216 | + credential_provider: credential_provider,
191 217 |   low_memory: low_memory,
    218 | + use_statistics: use_statistics,
    219 | + hive_partitioning: hive_partitioning,
    220 | + schema: schema,
    221 | + hive_schema: hive_schema,
    222 | + try_parse_hive_dates: try_parse_hive_dates,
    223 | + retries: retries,
192 224 |   glob: glob,
193     | - include_file_paths: include_file_paths
    225 | + include_file_paths: include_file_paths,
    226 | + allow_missing_columns: allow_missing_columns
194 227 |   )
195 228 |   end
196 229 |
197 230 |   # @private
198 231 |   def _scan_parquet_impl(
199     | -
    232 | + source,
200 233 |   n_rows: nil,
201 234 |   cache: true,
202 235 |   parallel: "auto",
203 236 |   rechunk: true,
204     | -
205     | -
    237 | + row_index_name: nil,
    238 | + row_index_offset: 0,
206 239 |   storage_options: nil,
    240 | + credential_provider: nil,
207 241 |   low_memory: false,
208 242 |   use_statistics: true,
209 243 |   hive_partitioning: nil,
210 244 |   glob: true,
211     | -
    245 | + schema: nil,
    246 | + hive_schema: nil,
    247 | + try_parse_hive_dates: true,
    248 | + retries: 2,
    249 | + include_file_paths: nil,
    250 | + allow_missing_columns: false
212 251 |   )
    252 | + if source.is_a?(::Array)
    253 | + sources = source
    254 | + source = nil
    255 | + else
    256 | + sources = []
    257 | + end
    258 | +
    259 | + if storage_options
    260 | + storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
    261 | + else
    262 | + storage_options = nil
    263 | + end
    264 | +
213 265 |   rblf =
214 266 |   RbLazyFrame.new_from_parquet(
215     | -
216     | -
    267 | + source,
    268 | + sources,
217 269 |   n_rows,
218 270 |   cache,
219 271 |   parallel,
220 272 |   rechunk,
221     | - Utils.parse_row_index_args(
    273 | + Utils.parse_row_index_args(row_index_name, row_index_offset),
222 274 |   low_memory,
    275 | + storage_options,
    276 | + credential_provider,
223 277 |   use_statistics,
224 278 |   hive_partitioning,
225     | -
226     | -
    279 | + schema,
    280 | + hive_schema,
    281 | + try_parse_hive_dates,
    282 | + retries,
227 283 |   glob,
228     | - include_file_paths
    284 | + include_file_paths,
    285 | + allow_missing_columns
229 286 |   )
230 287 |   Utils.wrap_ldf(rblf)
231 288 |   end
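
For orientation: per the diff above, read_parquet in 0.15.0 now builds a scan_parquet query and collects it, so the two methods share most keyword arguments. Below is a minimal usage sketch of that API; the file paths and column names are hypothetical placeholders, and only options that appear in the signatures above are used.

require "polars-df"

# Eager read; the path and column names here are illustrative only.
df = Polars.read_parquet(
  "data/*.parquet",
  columns: ["id", "ts"],   # column names, or a list of integer indices
  glob: true,              # expand the wildcard via globbing rules
  rechunk: false,          # 0.15.0 default; set true for contiguous memory chunks
  retries: 2               # only relevant when reading from cloud storage
)

# Lazy scan; reading is deferred until collect, so predicates and projections
# can be pushed down to the Parquet scan.
lf = Polars.scan_parquet("data/*.parquet", include_file_paths: "path")
df2 = lf.filter(Polars.col("id") > 100).collect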