RubyGems - polars-df - Versions diffs - 0.11.0-x86_64-linux → 0.13.0-x86_64-linux - Mend

polars-df 0.11.0-x86_64-linux → 0.13.0-x86_64-linux

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +22 -0
data/Cargo.lock +428 -450
data/LICENSE-THIRD-PARTY.txt +2502 -2242
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/3.3/polars.so +0 -0
data/lib/polars/array_expr.rb +4 -4
data/lib/polars/batched_csv_reader.rb +2 -2
data/lib/polars/cat_expr.rb +0 -36
data/lib/polars/cat_name_space.rb +0 -37
data/lib/polars/data_frame.rb +93 -101
data/lib/polars/data_types.rb +1 -1
data/lib/polars/date_time_expr.rb +525 -573
data/lib/polars/date_time_name_space.rb +263 -464
data/lib/polars/dynamic_group_by.rb +3 -3
data/lib/polars/exceptions.rb +3 -0
data/lib/polars/expr.rb +367 -330
data/lib/polars/expr_dispatch.rb +1 -1
data/lib/polars/functions/aggregation/horizontal.rb +8 -8
data/lib/polars/functions/as_datatype.rb +63 -40
data/lib/polars/functions/lazy.rb +63 -14
data/lib/polars/functions/lit.rb +1 -1
data/lib/polars/functions/range/date_range.rb +18 -77
data/lib/polars/functions/range/datetime_range.rb +4 -4
data/lib/polars/functions/range/int_range.rb +2 -2
data/lib/polars/functions/range/time_range.rb +4 -4
data/lib/polars/functions/repeat.rb +1 -1
data/lib/polars/functions/whenthen.rb +1 -1
data/lib/polars/io/csv.rb +8 -8
data/lib/polars/io/ipc.rb +35 -7
data/lib/polars/io/json.rb +13 -2
data/lib/polars/io/ndjson.rb +15 -4
data/lib/polars/io/parquet.rb +15 -8
data/lib/polars/lazy_frame.rb +123 -105
data/lib/polars/lazy_group_by.rb +1 -1
data/lib/polars/list_expr.rb +11 -11
data/lib/polars/list_name_space.rb +5 -1
data/lib/polars/rolling_group_by.rb +5 -7
data/lib/polars/series.rb +108 -191
data/lib/polars/string_expr.rb +51 -76
data/lib/polars/string_name_space.rb +5 -4
data/lib/polars/testing.rb +2 -2
data/lib/polars/utils/constants.rb +9 -0
data/lib/polars/utils/convert.rb +97 -0
data/lib/polars/utils/parse.rb +89 -0
data/lib/polars/utils/various.rb +76 -0
data/lib/polars/utils/wrap.rb +19 -0
data/lib/polars/utils.rb +4 -330
data/lib/polars/version.rb +1 -1
data/lib/polars/whenthen.rb +6 -6
data/lib/polars.rb +11 -0
metadata +7 -2

data/lib/polars/io/csv.rb CHANGED Viewed

@@ -104,7 +104,7 @@ module Polars
       ignore_errors: false,
       parse_dates: false,
       n_threads: nil,
-      infer_schema_length: 100,
+      infer_schema_length: N_INFER_DEFAULT,
       batch_size: 8192,
       n_rows: nil,
       encoding: "utf8",
@@ -192,7 +192,7 @@ module Polars
       ignore_errors: false,
       parse_dates: false,
       n_threads: nil,
-      infer_schema_length: 100,
+      infer_schema_length: N_INFER_DEFAULT,
       batch_size: 8192,
       n_rows: nil,
       encoding: "utf8",
@@ -222,7 +222,7 @@ module Polars
       if !dtypes.nil?
         if dtypes.is_a?(Hash)
           dtype_list = []
-          dtypes.each do|k, v|
+          dtypes.each do |k, v|
             dtype_list << [k, Utils.rb_type_to_dtype(v)]
           end
         elsif dtypes.is_a?(::Array)
@@ -304,7 +304,7 @@ module Polars
           missing_utf8_is_empty_string,
           parse_dates,
           skip_rows_after_header,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
           sample_size,
           eol_char,
           raise_if_empty,
@@ -422,7 +422,7 @@ module Polars
       ignore_errors: false,
       parse_dates: false,
       n_threads: nil,
-      infer_schema_length: 100,
+      infer_schema_length: N_INFER_DEFAULT,
       batch_size: 50_000,
       n_rows: nil,
       encoding: "utf8",
@@ -567,7 +567,7 @@ module Polars
       ignore_errors: false,
       cache: true,
       with_column_names: nil,
-      infer_schema_length: 100,
+      infer_schema_length: N_INFER_DEFAULT,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
@@ -629,7 +629,7 @@ module Polars
       ignore_errors: false,
       cache: true,
       with_column_names: nil,
-      infer_schema_length: 100,
+      infer_schema_length: N_INFER_DEFAULT,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
@@ -669,7 +669,7 @@ module Polars
           rechunk,
           skip_rows_after_header,
           encoding,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
           parse_dates,
           eol_char,
           truncate_ragged_lines

data/lib/polars/io/ipc.rb CHANGED Viewed

@@ -76,7 +76,7 @@ module Polars
           columns,
           projection,
           n_rows,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
           memory_map
         )
       Utils.wrap_df(rbdf)
@@ -149,7 +149,7 @@ module Polars
         columns,
         projection,
         n_rows,
-        Utils._prepare_row_count_args(row_index_name, row_index_offset),
+        Utils.parse_row_index_args(row_index_name, row_index_offset),
         rechunk
       )
       Utils.wrap_df(pydf)
@@ -193,6 +193,18 @@ module Polars
     #   Try to memory map the file. This can greatly improve performance on repeated
     #   queries as the OS may cache pages.
     #   Only uncompressed IPC files can be memory mapped.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from Hive partitioned URL and use them
+    #   to prune reads. This is unset by default (i.e. `nil`), meaning it is
+    #   automatically enabled when a single directory is passed, and otherwise
+    #   disabled.
+    # @param hive_schema [Hash]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_ipc(
@@ -203,7 +215,11 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      memory_map: true
+      memory_map: true,
+      hive_partitioning: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      include_file_paths: nil
     )
       _scan_ipc_impl(
         source,
@@ -213,7 +229,11 @@ module Polars
         row_count_name: row_count_name,
         row_count_offset: row_count_offset,
         storage_options: storage_options,
-        memory_map: memory_map
+        memory_map: memory_map,
+        hive_partitioning: hive_partitioning,
+        hive_schema: hive_schema,
+        try_parse_hive_dates: try_parse_hive_dates,
+        include_file_paths: include_file_paths
       )
     end
@@ -226,7 +246,11 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      memory_map: true
+      memory_map: true,
+      hive_partitioning: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      include_file_paths: nil
     )
       if Utils.pathlike?(file)
         file = Utils.normalize_filepath(file)
@@ -238,8 +262,12 @@ module Polars
           n_rows,
           cache,
           rechunk,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          memory_map
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          memory_map,
+          hive_partitioning,
+          hive_schema,
+          try_parse_hive_dates,
+          include_file_paths
         )
       Utils.wrap_ldf(rblf)
     end

data/lib/polars/io/json.rb CHANGED Viewed

@@ -6,12 +6,23 @@ module Polars
     #   Path to a file or a file-like object.
     #
     # @return [DataFrame]
-    def read_json(source)
+    def read_json(
+      source,
+      schema: nil,
+      schema_overrides: nil,
+      infer_schema_length: N_INFER_DEFAULT
+    )
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
       end
-      rbdf = RbDataFrame.read_json(source)
+      rbdf =
+        RbDataFrame.read_json(
+          source,
+          infer_schema_length,
+          schema,
+          schema_overrides
+        )
       Utils.wrap_df(rbdf)
     end
   end

data/lib/polars/io/ndjson.rb CHANGED Viewed

@@ -6,12 +6,23 @@ module Polars
     #   Path to a file or a file-like object.
     #
     # @return [DataFrame]
-    def read_ndjson(source)
+    def read_ndjson(
+      source,
+      schema: nil,
+      schema_overrides: nil,
+      ignore_errors: false
+    )
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
       end
-      rbdf = RbDataFrame.read_ndjson(source)
+      rbdf =
+        RbDataFrame.read_ndjson(
+          source,
+          ignore_errors,
+          schema,
+          schema_overrides
+        )
       Utils.wrap_df(rbdf)
     end
@@ -41,7 +52,7 @@ module Polars
     # @return [LazyFrame]
     def scan_ndjson(
       source,
-      infer_schema_length: 100,
+      infer_schema_length: N_INFER_DEFAULT,
       batch_size: 1024,
       n_rows: nil,
       low_memory: false,
@@ -61,7 +72,7 @@ module Polars
           n_rows,
           low_memory,
           rechunk,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset)
+          Utils.parse_row_index_args(row_count_name, row_count_offset)
         )
       Utils.wrap_ldf(rblf)
     end

data/lib/polars/io/parquet.rb CHANGED Viewed

@@ -110,7 +110,7 @@ module Polars
           projection,
           n_rows,
           parallel,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
           low_memory,
           use_statistics,
           rechunk
@@ -158,6 +158,8 @@ module Polars
     #   Extra options that make sense for a particular storage connection.
     # @param low_memory [Boolean]
     #   Reduce memory pressure at the expense of performance.
+    # @param include_file_paths [String]
+    #  Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_parquet(
@@ -170,7 +172,8 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      low_memory: false
+      low_memory: false,
+      include_file_paths: nil
     )
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
@@ -178,7 +181,7 @@ module Polars
       _scan_parquet_impl(
         source,
-        n_rows:n_rows,
+        n_rows: n_rows,
         cache: cache,
         parallel: parallel,
         rechunk: rechunk,
@@ -186,7 +189,8 @@ module Polars
         row_count_offset: row_count_offset,
         storage_options: storage_options,
         low_memory: low_memory,
-        glob: glob
+        glob: glob,
+        include_file_paths: include_file_paths
       )
     end
@@ -202,8 +206,9 @@ module Polars
       storage_options: nil,
       low_memory: false,
       use_statistics: true,
-      hive_partitioning: true,
-      glob: true
+      hive_partitioning: nil,
+      glob: true,
+      include_file_paths: nil
     )
       rblf =
         RbLazyFrame.new_from_parquet(
@@ -213,12 +218,14 @@ module Polars
           cache,
           parallel,
           rechunk,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
           low_memory,
           use_statistics,
           hive_partitioning,
           nil,
-          glob
+          true,
+          glob,
+          include_file_paths
         )
       Utils.wrap_ldf(rblf)
     end