polars-df 0.3.1-x86_64-linux → 0.5.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -1
- data/Cargo.lock +486 -380
- data/Cargo.toml +0 -2
- data/LICENSE-THIRD-PARTY.txt +7353 -8473
- data/README.md +31 -2
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +263 -87
- data/lib/polars/data_types.rb +6 -4
- data/lib/polars/date_time_expr.rb +148 -8
- data/lib/polars/expr.rb +78 -11
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +107 -10
- data/lib/polars/lazy_functions.rb +7 -3
- data/lib/polars/list_expr.rb +70 -21
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +190 -74
- data/lib/polars/string_expr.rb +150 -44
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +51 -9
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +4 -2
- metadata +4 -2
    
        data/lib/polars/data_frame.rb
    CHANGED
    
    | @@ -18,7 +18,10 @@ module Polars | |
| 18 18 | 
             
                #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
         | 
| 19 19 | 
             
                #   the orientation is inferred by matching the columns and data dimensions. If
         | 
| 20 20 | 
             
                #   this does not yield conclusive results, column orientation is used.
         | 
| 21 | 
            -
                def initialize(data = nil, columns: nil, orient: nil)
         | 
| 21 | 
            +
                def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
         | 
| 22 | 
            +
                  schema ||= columns
         | 
| 23 | 
            +
                  raise Todo if schema_overrides
         | 
| 24 | 
            +
             | 
| 22 25 | 
             
                  # TODO deprecate in favor of read_sql
         | 
| 23 26 | 
             
                  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
         | 
| 24 27 | 
             
                    result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
         | 
| @@ -29,14 +32,14 @@ module Polars | |
| 29 32 | 
             
                  end
         | 
| 30 33 |  | 
| 31 34 | 
             
                  if data.nil?
         | 
| 32 | 
            -
                    self._df = self.class.hash_to_rbdf({},  | 
| 35 | 
            +
                    self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
         | 
| 33 36 | 
             
                  elsif data.is_a?(Hash)
         | 
| 34 37 | 
             
                    data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
         | 
| 35 | 
            -
                    self._df = self.class.hash_to_rbdf(data,  | 
| 38 | 
            +
                    self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
         | 
| 36 39 | 
             
                  elsif data.is_a?(Array)
         | 
| 37 | 
            -
                    self._df = self.class.sequence_to_rbdf(data,  | 
| 40 | 
            +
                    self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
         | 
| 38 41 | 
             
                  elsif data.is_a?(Series)
         | 
| 39 | 
            -
                    self._df = self.class.series_to_rbdf(data,  | 
| 42 | 
            +
                    self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
         | 
| 40 43 | 
             
                  else
         | 
| 41 44 | 
             
                    raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
         | 
| 42 45 | 
             
                  end
         | 
| @@ -56,8 +59,8 @@ module Polars | |
| 56 59 | 
             
                end
         | 
| 57 60 |  | 
| 58 61 | 
             
                # @private
         | 
| 59 | 
            -
                def self._from_hash(data,  | 
| 60 | 
            -
                  _from_rbdf(hash_to_rbdf(data,  | 
| 62 | 
            +
                def self._from_hash(data, schema: nil, schema_overrides: nil)
         | 
| 63 | 
            +
                  _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
         | 
| 61 64 | 
             
                end
         | 
| 62 65 |  | 
| 63 66 | 
             
                # def self._from_records
         | 
| @@ -97,7 +100,7 @@ module Polars | |
| 97 100 | 
             
                  eol_char: "\n"
         | 
| 98 101 | 
             
                )
         | 
| 99 102 | 
             
                  if Utils.pathlike?(file)
         | 
| 100 | 
            -
                    path = Utils. | 
| 103 | 
            +
                    path = Utils.normalise_filepath(file)
         | 
| 101 104 | 
             
                  else
         | 
| 102 105 | 
             
                    path = nil
         | 
| 103 106 | 
             
                    # if defined?(StringIO) && file.is_a?(StringIO)
         | 
| @@ -196,32 +199,56 @@ module Polars | |
| 196 199 |  | 
| 197 200 | 
             
                # @private
         | 
| 198 201 | 
             
                def self._read_parquet(
         | 
| 199 | 
            -
                   | 
| 202 | 
            +
                  source,
         | 
| 200 203 | 
             
                  columns: nil,
         | 
| 201 204 | 
             
                  n_rows: nil,
         | 
| 202 205 | 
             
                  parallel: "auto",
         | 
| 203 206 | 
             
                  row_count_name: nil,
         | 
| 204 207 | 
             
                  row_count_offset: 0,
         | 
| 205 | 
            -
                  low_memory: false
         | 
| 208 | 
            +
                  low_memory: false,
         | 
| 209 | 
            +
                  use_statistics: true,
         | 
| 210 | 
            +
                  rechunk: true
         | 
| 206 211 | 
             
                )
         | 
| 207 | 
            -
                  if Utils.pathlike?( | 
| 208 | 
            -
                     | 
| 212 | 
            +
                  if Utils.pathlike?(source)
         | 
| 213 | 
            +
                    source = Utils.normalise_filepath(source)
         | 
| 214 | 
            +
                  end
         | 
| 215 | 
            +
                  if columns.is_a?(String)
         | 
| 216 | 
            +
                    columns = [columns]
         | 
| 209 217 | 
             
                  end
         | 
| 210 218 |  | 
| 211 | 
            -
                  if  | 
| 212 | 
            -
                     | 
| 219 | 
            +
                  if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
         | 
| 220 | 
            +
                    scan =
         | 
| 221 | 
            +
                      Polars.scan_parquet(
         | 
| 222 | 
            +
                        source,
         | 
| 223 | 
            +
                        n_rows: n_rows,
         | 
| 224 | 
            +
                        rechunk: true,
         | 
| 225 | 
            +
                        parallel: parallel,
         | 
| 226 | 
            +
                        row_count_name: row_count_name,
         | 
| 227 | 
            +
                        row_count_offset: row_count_offset,
         | 
| 228 | 
            +
                        low_memory: low_memory
         | 
| 229 | 
            +
                      )
         | 
| 230 | 
            +
             | 
| 231 | 
            +
                    if columns.nil?
         | 
| 232 | 
            +
                      return self._from_rbdf(scan.collect._df)
         | 
| 233 | 
            +
                    elsif Utils.is_str_sequence(columns, allow_str: false)
         | 
| 234 | 
            +
                      return self._from_rbdf(scan.select(columns).collect._df)
         | 
| 235 | 
            +
                    else
         | 
| 236 | 
            +
                      raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
         | 
| 237 | 
            +
                    end
         | 
| 213 238 | 
             
                  end
         | 
| 214 239 |  | 
| 215 240 | 
             
                  projection, columns = Utils.handle_projection_columns(columns)
         | 
| 216 241 | 
             
                  _from_rbdf(
         | 
| 217 242 | 
             
                    RbDataFrame.read_parquet(
         | 
| 218 | 
            -
                       | 
| 243 | 
            +
                      source,
         | 
| 219 244 | 
             
                      columns,
         | 
| 220 245 | 
             
                      projection,
         | 
| 221 246 | 
             
                      n_rows,
         | 
| 222 247 | 
             
                      parallel,
         | 
| 223 248 | 
             
                      Utils._prepare_row_count_args(row_count_name, row_count_offset),
         | 
| 224 | 
            -
                      low_memory
         | 
| 249 | 
            +
                      low_memory,
         | 
| 250 | 
            +
                      use_statistics,
         | 
| 251 | 
            +
                      rechunk
         | 
| 225 252 | 
             
                    )
         | 
| 226 253 | 
             
                  )
         | 
| 227 254 | 
             
                end
         | 
| @@ -229,7 +256,7 @@ module Polars | |
| 229 256 | 
             
                # @private
         | 
| 230 257 | 
             
                def self._read_avro(file, columns: nil, n_rows: nil)
         | 
| 231 258 | 
             
                  if Utils.pathlike?(file)
         | 
| 232 | 
            -
                    file = Utils. | 
| 259 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 233 260 | 
             
                  end
         | 
| 234 261 | 
             
                  projection, columns = Utils.handle_projection_columns(columns)
         | 
| 235 262 | 
             
                  _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
         | 
| @@ -246,7 +273,7 @@ module Polars | |
| 246 273 | 
             
                  memory_map: true
         | 
| 247 274 | 
             
                )
         | 
| 248 275 | 
             
                  if Utils.pathlike?(file)
         | 
| 249 | 
            -
                    file = Utils. | 
| 276 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 250 277 | 
             
                  end
         | 
| 251 278 | 
             
                  if columns.is_a?(String)
         | 
| 252 279 | 
             
                    columns = [columns]
         | 
| @@ -272,7 +299,7 @@ module Polars | |
| 272 299 | 
             
                # @private
         | 
| 273 300 | 
             
                def self._read_json(file)
         | 
| 274 301 | 
             
                  if Utils.pathlike?(file)
         | 
| 275 | 
            -
                    file = Utils. | 
| 302 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 276 303 | 
             
                  end
         | 
| 277 304 |  | 
| 278 305 | 
             
                  _from_rbdf(RbDataFrame.read_json(file))
         | 
| @@ -281,7 +308,7 @@ module Polars | |
| 281 308 | 
             
                # @private
         | 
| 282 309 | 
             
                def self._read_ndjson(file)
         | 
| 283 310 | 
             
                  if Utils.pathlike?(file)
         | 
| 284 | 
            -
                    file = Utils. | 
| 311 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 285 312 | 
             
                  end
         | 
| 286 313 |  | 
| 287 314 | 
             
                  _from_rbdf(RbDataFrame.read_ndjson(file))
         | 
| @@ -312,6 +339,7 @@ module Polars | |
| 312 339 | 
             
                end
         | 
| 313 340 | 
             
                alias_method :count, :height
         | 
| 314 341 | 
             
                alias_method :length, :height
         | 
| 342 | 
            +
                alias_method :size, :height
         | 
| 315 343 |  | 
| 316 344 | 
             
                # Get the width of the DataFrame.
         | 
| 317 345 | 
             
                #
         | 
| @@ -522,6 +550,13 @@ module Polars | |
| 522 550 | 
             
                end
         | 
| 523 551 | 
             
                alias_method :inspect, :to_s
         | 
| 524 552 |  | 
| 553 | 
            +
                # Returns an array representing the DataFrame
         | 
| 554 | 
            +
                #
         | 
| 555 | 
            +
                # @return [Array]
         | 
| 556 | 
            +
                def to_a
         | 
| 557 | 
            +
                  rows(named: true)
         | 
| 558 | 
            +
                end
         | 
| 559 | 
            +
             | 
| 525 560 | 
             
                # Check if DataFrame includes column.
         | 
| 526 561 | 
             
                #
         | 
| 527 562 | 
             
                # @return [Boolean]
         | 
| @@ -631,7 +666,7 @@ module Polars | |
| 631 666 | 
             
                  end
         | 
| 632 667 |  | 
| 633 668 | 
             
                  # Ruby-specific
         | 
| 634 | 
            -
                  if item.is_a?(Expr)
         | 
| 669 | 
            +
                  if item.is_a?(Expr) || item.is_a?(Series)
         | 
| 635 670 | 
             
                    return filter(item)
         | 
| 636 671 | 
             
                  end
         | 
| 637 672 |  | 
| @@ -641,15 +676,42 @@ module Polars | |
| 641 676 | 
             
                # Set item.
         | 
| 642 677 | 
             
                #
         | 
| 643 678 | 
             
                # @return [Object]
         | 
| 644 | 
            -
                 | 
| 645 | 
            -
             | 
| 646 | 
            -
             | 
| 647 | 
            -
             | 
| 648 | 
            -
             | 
| 679 | 
            +
                def []=(*key, value)
         | 
| 680 | 
            +
                  if key.length == 1
         | 
| 681 | 
            +
                    key = key.first
         | 
| 682 | 
            +
                  elsif key.length != 2
         | 
| 683 | 
            +
                    raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
         | 
| 684 | 
            +
                  end
         | 
| 649 685 |  | 
| 650 | 
            -
             | 
| 651 | 
            -
             | 
| 686 | 
            +
                  if Utils.strlike?(key)
         | 
| 687 | 
            +
                    if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
         | 
| 688 | 
            +
                      value = Series.new(value)
         | 
| 689 | 
            +
                    elsif !value.is_a?(Series)
         | 
| 690 | 
            +
                      value = Polars.lit(value)
         | 
| 691 | 
            +
                    end
         | 
| 692 | 
            +
                    self._df = with_column(value.alias(key.to_s))._df
         | 
| 693 | 
            +
                  elsif key.is_a?(Array)
         | 
| 694 | 
            +
                    row_selection, col_selection = key
         | 
| 695 | 
            +
             | 
| 696 | 
            +
                    if Utils.strlike?(col_selection)
         | 
| 697 | 
            +
                      s = self[col_selection]
         | 
| 698 | 
            +
                    elsif col_selection.is_a?(Integer)
         | 
| 699 | 
            +
                      raise Todo
         | 
| 700 | 
            +
                    else
         | 
| 701 | 
            +
                      raise ArgumentError, "column selection not understood: #{col_selection}"
         | 
| 702 | 
            +
                    end
         | 
| 703 | 
            +
             | 
| 704 | 
            +
                    s[row_selection] = value
         | 
| 652 705 |  | 
| 706 | 
            +
                    if col_selection.is_a?(Integer)
         | 
| 707 | 
            +
                      replace_at_idx(col_selection, s)
         | 
| 708 | 
            +
                    elsif Utils.strlike?(col_selection)
         | 
| 709 | 
            +
                      replace(col_selection, s)
         | 
| 710 | 
            +
                    end
         | 
| 711 | 
            +
                  else
         | 
| 712 | 
            +
                    raise Todo
         | 
| 713 | 
            +
                  end
         | 
| 714 | 
            +
                end
         | 
| 653 715 |  | 
| 654 716 | 
             
                # Return the dataframe as a scalar.
         | 
| 655 717 | 
             
                #
         | 
| @@ -774,7 +836,7 @@ module Polars | |
| 774 836 | 
             
                  row_oriented: false
         | 
| 775 837 | 
             
                )
         | 
| 776 838 | 
             
                  if Utils.pathlike?(file)
         | 
| 777 | 
            -
                    file = Utils. | 
| 839 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 778 840 | 
             
                  end
         | 
| 779 841 |  | 
| 780 842 | 
             
                  _df.write_json(file, pretty, row_oriented)
         | 
| @@ -789,7 +851,7 @@ module Polars | |
| 789 851 | 
             
                # @return [nil]
         | 
| 790 852 | 
             
                def write_ndjson(file)
         | 
| 791 853 | 
             
                  if Utils.pathlike?(file)
         | 
| 792 | 
            -
                    file = Utils. | 
| 854 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 793 855 | 
             
                  end
         | 
| 794 856 |  | 
| 795 857 | 
             
                  _df.write_ndjson(file)
         | 
| @@ -879,7 +941,7 @@ module Polars | |
| 879 941 | 
             
                  end
         | 
| 880 942 |  | 
| 881 943 | 
             
                  if Utils.pathlike?(file)
         | 
| 882 | 
            -
                    file = Utils. | 
| 944 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 883 945 | 
             
                  end
         | 
| 884 946 |  | 
| 885 947 | 
             
                  _df.write_csv(
         | 
| @@ -917,7 +979,7 @@ module Polars | |
| 917 979 | 
             
                    compression = "uncompressed"
         | 
| 918 980 | 
             
                  end
         | 
| 919 981 | 
             
                  if Utils.pathlike?(file)
         | 
| 920 | 
            -
                    file = Utils. | 
| 982 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 921 983 | 
             
                  end
         | 
| 922 984 |  | 
| 923 985 | 
             
                  _df.write_avro(file, compression)
         | 
| @@ -936,7 +998,7 @@ module Polars | |
| 936 998 | 
             
                    compression = "uncompressed"
         | 
| 937 999 | 
             
                  end
         | 
| 938 1000 | 
             
                  if Utils.pathlike?(file)
         | 
| 939 | 
            -
                    file = Utils. | 
| 1001 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 940 1002 | 
             
                  end
         | 
| 941 1003 |  | 
| 942 1004 | 
             
                  _df.write_ipc(file, compression)
         | 
| @@ -978,7 +1040,7 @@ module Polars | |
| 978 1040 | 
             
                    compression = "uncompressed"
         | 
| 979 1041 | 
             
                  end
         | 
| 980 1042 | 
             
                  if Utils.pathlike?(file)
         | 
| 981 | 
            -
                    file = Utils. | 
| 1043 | 
            +
                    file = Utils.normalise_filepath(file)
         | 
| 982 1044 | 
             
                  end
         | 
| 983 1045 |  | 
| 984 1046 | 
             
                  _df.write_parquet(
         | 
| @@ -1438,6 +1500,20 @@ module Polars | |
| 1438 1500 | 
             
                  end
         | 
| 1439 1501 | 
             
                end
         | 
| 1440 1502 |  | 
| 1503 | 
            +
                # Sort the DataFrame by column in-place.
         | 
| 1504 | 
            +
                #
         | 
| 1505 | 
            +
                # @param by [String]
         | 
| 1506 | 
            +
                #   By which column to sort.
         | 
| 1507 | 
            +
                # @param reverse [Boolean]
         | 
| 1508 | 
            +
                #   Reverse/descending sort.
         | 
| 1509 | 
            +
                # @param nulls_last [Boolean]
         | 
| 1510 | 
            +
                #   Place null values last. Can only be used if sorted by a single column.
         | 
| 1511 | 
            +
                #
         | 
| 1512 | 
            +
                # @return [DataFrame]
         | 
| 1513 | 
            +
                def sort!(by, reverse: false, nulls_last: false)
         | 
| 1514 | 
            +
                  self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
         | 
| 1515 | 
            +
                end
         | 
| 1516 | 
            +
             | 
| 1441 1517 | 
             
                # Check if DataFrame is equal to other.
         | 
| 1442 1518 | 
             
                #
         | 
| 1443 1519 | 
             
                # @param other [DataFrame]
         | 
| @@ -1495,7 +1571,7 @@ module Polars | |
| 1495 1571 | 
             
                #   # │ 30  ┆ 6   │
         | 
| 1496 1572 | 
             
                #   # └─────┴─────┘
         | 
| 1497 1573 | 
             
                def replace(column, new_col)
         | 
| 1498 | 
            -
                  _df.replace(column, new_col._s)
         | 
| 1574 | 
            +
                  _df.replace(column.to_s, new_col._s)
         | 
| 1499 1575 | 
             
                  self
         | 
| 1500 1576 | 
             
                end
         | 
| 1501 1577 |  | 
| @@ -1836,7 +1912,7 @@ module Polars | |
| 1836 1912 | 
             
                #     "2020-01-08 23:16:43"
         | 
| 1837 1913 | 
             
                #   ]
         | 
| 1838 1914 | 
             
                #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
         | 
| 1839 | 
            -
                #     Polars.col("dt").str.strptime( | 
| 1915 | 
            +
                #     Polars.col("dt").str.strptime(Polars::Datetime)
         | 
| 1840 1916 | 
             
                #   )
         | 
| 1841 1917 | 
             
                #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
         | 
| 1842 1918 | 
             
                #     [
         | 
| @@ -2767,6 +2843,16 @@ module Polars | |
| 2767 2843 | 
             
                  Utils.wrap_s(_df.drop_in_place(name))
         | 
| 2768 2844 | 
             
                end
         | 
| 2769 2845 |  | 
| 2846 | 
            +
                # Drop in place if exists.
         | 
| 2847 | 
            +
                #
         | 
| 2848 | 
            +
                # @param name [Object]
         | 
| 2849 | 
            +
                #   Column to drop.
         | 
| 2850 | 
            +
                #
         | 
| 2851 | 
            +
                # @return [Series]
         | 
| 2852 | 
            +
                def delete(name)
         | 
| 2853 | 
            +
                  drop_in_place(name) if include?(name)
         | 
| 2854 | 
            +
                end
         | 
| 2855 | 
            +
             | 
| 2770 2856 | 
             
                # Create an empty copy of the current DataFrame.
         | 
| 2771 2857 | 
             
                #
         | 
| 2772 2858 | 
             
                # Returns a DataFrame with identical schema but no data.
         | 
| @@ -3042,24 +3128,28 @@ module Polars | |
| 3042 3128 | 
             
                  if aggregate_fn.is_a?(String)
         | 
| 3043 3129 | 
             
                    case aggregate_fn
         | 
| 3044 3130 | 
             
                    when "first"
         | 
| 3045 | 
            -
                       | 
| 3131 | 
            +
                      aggregate_expr = Polars.element.first._rbexpr
         | 
| 3046 3132 | 
             
                    when "sum"
         | 
| 3047 | 
            -
                       | 
| 3133 | 
            +
                      aggregate_expr = Polars.element.sum._rbexpr
         | 
| 3048 3134 | 
             
                    when "max"
         | 
| 3049 | 
            -
                       | 
| 3135 | 
            +
                      aggregate_expr = Polars.element.max._rbexpr
         | 
| 3050 3136 | 
             
                    when "min"
         | 
| 3051 | 
            -
                       | 
| 3137 | 
            +
                      aggregate_expr = Polars.element.min._rbexpr
         | 
| 3052 3138 | 
             
                    when "mean"
         | 
| 3053 | 
            -
                       | 
| 3139 | 
            +
                      aggregate_expr = Polars.element.mean._rbexpr
         | 
| 3054 3140 | 
             
                    when "median"
         | 
| 3055 | 
            -
                       | 
| 3141 | 
            +
                      aggregate_expr = Polars.element.median._rbexpr
         | 
| 3056 3142 | 
             
                    when "last"
         | 
| 3057 | 
            -
                       | 
| 3143 | 
            +
                      aggregate_expr = Polars.element.last._rbexpr
         | 
| 3058 3144 | 
             
                    when "count"
         | 
| 3059 | 
            -
                       | 
| 3145 | 
            +
                      aggregate_expr = Polars.count._rbexpr
         | 
| 3060 3146 | 
             
                    else
         | 
| 3061 3147 | 
             
                      raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
         | 
| 3062 3148 | 
             
                    end
         | 
| 3149 | 
            +
                  elsif aggregate_fn.nil?
         | 
| 3150 | 
            +
                    aggregate_expr = nil
         | 
| 3151 | 
            +
                  else
         | 
| 3152 | 
            +
                    aggregate_expr = aggregate_function._rbexpr
         | 
| 3063 3153 | 
             
                  end
         | 
| 3064 3154 |  | 
| 3065 3155 | 
             
                  _from_rbdf(
         | 
| @@ -3067,9 +3157,9 @@ module Polars | |
| 3067 3157 | 
             
                      values,
         | 
| 3068 3158 | 
             
                      index,
         | 
| 3069 3159 | 
             
                      columns,
         | 
| 3070 | 
            -
                      aggregate_fn._rbexpr,
         | 
| 3071 3160 | 
             
                      maintain_order,
         | 
| 3072 3161 | 
             
                      sort_columns,
         | 
| 3162 | 
            +
                      aggregate_expr,
         | 
| 3073 3163 | 
             
                      separator
         | 
| 3074 3164 | 
             
                    )
         | 
| 3075 3165 | 
             
                  )
         | 
| @@ -3174,7 +3264,7 @@ module Polars | |
| 3174 3264 | 
             
                #   # │ B    ┆ 1    │
         | 
| 3175 3265 | 
             
                #   # │ C    ┆ 2    │
         | 
| 3176 3266 | 
             
                #   # │ D    ┆ 3    │
         | 
| 3177 | 
            -
                #   # │  | 
| 3267 | 
            +
                #   # │ E    ┆ 4    │
         | 
| 3178 3268 | 
             
                #   # │ F    ┆ 5    │
         | 
| 3179 3269 | 
             
                #   # │ G    ┆ 6    │
         | 
| 3180 3270 | 
             
                #   # │ H    ┆ 7    │
         | 
| @@ -4053,15 +4143,12 @@ module Polars | |
| 4053 4143 | 
             
                #   # │ 5   ┆ 3.0 ┆ true  │
         | 
| 4054 4144 | 
             
                #   # └─────┴─────┴───────┘
         | 
| 4055 4145 | 
             
                def unique(maintain_order: true, subset: nil, keep: "first")
         | 
| 4056 | 
            -
                   | 
| 4057 | 
            -
                     | 
| 4058 | 
            -
                      subset  | 
| 4059 | 
            -
             | 
| 4060 | 
            -
                       | 
| 4061 | 
            -
             | 
| 4062 | 
            -
                  end
         | 
| 4063 | 
            -
             | 
| 4064 | 
            -
                  _from_rbdf(_df.unique(maintain_order, subset, keep))
         | 
| 4146 | 
            +
                  self._from_rbdf(
         | 
| 4147 | 
            +
                    lazy
         | 
| 4148 | 
            +
                      .unique(maintain_order: maintain_order, subset: subset, keep: keep)
         | 
| 4149 | 
            +
                      .collect(no_optimization: true)
         | 
| 4150 | 
            +
                      ._df
         | 
| 4151 | 
            +
                  )
         | 
| 4065 4152 | 
             
                end
         | 
| 4066 4153 |  | 
| 4067 4154 | 
             
                # Return the number of unique rows, or the number of unique row-subsets.
         | 
| @@ -4403,7 +4490,7 @@ module Polars | |
| 4403 4490 | 
             
                  end
         | 
| 4404 4491 | 
             
                end
         | 
| 4405 4492 |  | 
| 4406 | 
            -
                # Returns an iterator over the DataFrame of rows of  | 
| 4493 | 
            +
                # Returns an iterator over the DataFrame of rows of Ruby-native values.
         | 
| 4407 4494 | 
             
                #
         | 
| 4408 4495 | 
             
                # @param named [Boolean]
         | 
| 4409 4496 | 
             
                #   Return hashes instead of arrays. The hashes are a mapping of
         | 
| @@ -4464,6 +4551,24 @@ module Polars | |
| 4464 4551 | 
             
                  end
         | 
| 4465 4552 | 
             
                end
         | 
| 4466 4553 |  | 
| 4554 | 
            +
                # Returns an iterator over the DataFrame of rows of Ruby-native values.
         | 
| 4555 | 
            +
                #
         | 
| 4556 | 
            +
                # @param named [Boolean]
         | 
| 4557 | 
            +
                #   Return hashes instead of arrays. The hashes are a mapping of
         | 
| 4558 | 
            +
                #   column name to row value. This is more expensive than returning an
         | 
| 4559 | 
            +
                #   array, but allows for accessing values by column name.
         | 
| 4560 | 
            +
                # @param buffer_size [Integer]
         | 
| 4561 | 
            +
                #   Determines the number of rows that are buffered internally while iterating
         | 
| 4562 | 
            +
                #   over the data; you should only modify this in very specific cases where the
         | 
| 4563 | 
            +
                #   default value is determined not to be a good fit to your access pattern, as
         | 
| 4564 | 
            +
                #   the speedup from using the buffer is significant (~2-4x). Setting this
         | 
| 4565 | 
            +
                #   value to zero disables row buffering.
         | 
| 4566 | 
            +
                #
         | 
| 4567 | 
            +
                # @return [Object]
         | 
| 4568 | 
            +
                def each_row(named: true, buffer_size: 500, &block)
         | 
| 4569 | 
            +
                  iter_rows(named: named, buffer_size: buffer_size, &block)
         | 
| 4570 | 
            +
                end
         | 
| 4571 | 
            +
             | 
| 4467 4572 | 
             
                # Shrink DataFrame memory usage.
         | 
| 4468 4573 | 
             
                #
         | 
| 4469 4574 | 
             
                # Shrinks to fit the exact capacity needed to hold the data.
         | 
| @@ -4717,20 +4822,63 @@ module Polars | |
| 4717 4822 | 
             
                end
         | 
| 4718 4823 |  | 
| 4719 4824 | 
             
                # @private
         | 
| 4720 | 
            -
                def self. | 
| 4721 | 
            -
                   | 
| 4722 | 
            -
             | 
| 4825 | 
            +
                def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
         | 
| 4826 | 
            +
                  updated_data = {}
         | 
| 4827 | 
            +
                  unless data.empty?
         | 
| 4828 | 
            +
                    dtypes = schema_overrides || {}
         | 
| 4829 | 
            +
                    array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
         | 
| 4830 | 
            +
                    if array_len > 0
         | 
| 4831 | 
            +
                      data.each do |name, val|
         | 
| 4832 | 
            +
                        dtype = dtypes[name]
         | 
| 4833 | 
            +
                        if val.is_a?(Hash) && dtype != Struct
         | 
| 4834 | 
            +
                          updated_data[name] = DataFrame.new(val).to_struct(name)
         | 
| 4835 | 
            +
                        elsif !Utils.arrlen(val).nil?
         | 
| 4836 | 
            +
                          updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
         | 
| 4837 | 
            +
                        elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
         | 
| 4838 | 
            +
                          dtype = Polars::Float64 if val.nil? && dtype.nil?
         | 
| 4839 | 
            +
                          updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
         | 
| 4840 | 
            +
                        else
         | 
| 4841 | 
            +
                          raise Todo
         | 
| 4842 | 
            +
                        end
         | 
| 4843 | 
            +
                      end
         | 
| 4844 | 
            +
                    elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
         | 
| 4845 | 
            +
                      data.each do |name, val|
         | 
| 4846 | 
            +
                        updated_data[name] = Series.new(name, val, dtype: dtypes[name])
         | 
| 4847 | 
            +
                      end
         | 
| 4848 | 
            +
                    elsif data.values.all? { |val| Utils.arrlen(val).nil? }
         | 
| 4849 | 
            +
                      data.each do |name, val|
         | 
| 4850 | 
            +
                        updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
         | 
| 4851 | 
            +
                      end
         | 
| 4852 | 
            +
                    end
         | 
| 4853 | 
            +
                  end
         | 
| 4854 | 
            +
                  updated_data
         | 
| 4855 | 
            +
                end
         | 
| 4723 4856 |  | 
| 4724 | 
            -
             | 
| 4725 | 
            -
             | 
| 4726 | 
            -
             | 
| 4727 | 
            -
             | 
| 4857 | 
            +
                # @private
         | 
| 4858 | 
            +
                def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
         | 
| 4859 | 
            +
                  if schema.is_a?(Hash) && !data.empty?
         | 
| 4860 | 
            +
                    if !data.all? { |col, _| schema[col] }
         | 
| 4861 | 
            +
                      raise ArgumentError, "The given column-schema names do not match the data dictionary"
         | 
| 4728 4862 | 
             
                    end
         | 
| 4729 | 
            -
             | 
| 4730 | 
            -
                     | 
| 4863 | 
            +
             | 
| 4864 | 
            +
                    data = schema.to_h { |col| [col, data[col]] }
         | 
| 4865 | 
            +
                  end
         | 
| 4866 | 
            +
             | 
| 4867 | 
            +
                  column_names, schema_overrides = _unpack_schema(
         | 
| 4868 | 
            +
                    schema, lookup_names: data.keys, schema_overrides: schema_overrides
         | 
| 4869 | 
            +
                  )
         | 
| 4870 | 
            +
                  if column_names.empty?
         | 
| 4871 | 
            +
                    column_names = data.keys
         | 
| 4872 | 
            +
                  end
         | 
| 4873 | 
            +
             | 
| 4874 | 
            +
                  if data.empty? && !schema_overrides.empty?
         | 
| 4875 | 
            +
                    data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
         | 
| 4876 | 
            +
                  else
         | 
| 4877 | 
            +
                    data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
         | 
| 4731 4878 | 
             
                  end
         | 
| 4732 4879 |  | 
| 4733 | 
            -
                   | 
| 4880 | 
            +
                  data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
         | 
| 4881 | 
            +
                  RbDataFrame.new(data_series)
         | 
| 4734 4882 | 
             
                end
         | 
| 4735 4883 |  | 
| 4736 4884 | 
             
                # @private
         | 
| @@ -4739,14 +4887,12 @@ module Polars | |
| 4739 4887 | 
             
                end
         | 
| 4740 4888 |  | 
| 4741 4889 | 
             
                # @private
         | 
| 4742 | 
            -
                def self. | 
| 4743 | 
            -
                   | 
| 4744 | 
            -
             | 
| 4745 | 
            -
                  if columns.is_a?(Hash)
         | 
| 4746 | 
            -
                    columns = columns.to_a
         | 
| 4890 | 
            +
                def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
         | 
| 4891 | 
            +
                  if schema.is_a?(Hash)
         | 
| 4892 | 
            +
                    schema = schema.to_a
         | 
| 4747 4893 | 
             
                  end
         | 
| 4748 4894 | 
             
                  column_names =
         | 
| 4749 | 
            -
                    ( | 
| 4895 | 
            +
                    (schema || []).map.with_index do |col, i|
         | 
| 4750 4896 | 
             
                      if col.is_a?(String)
         | 
| 4751 4897 | 
             
                        col || "column_#{i}"
         | 
| 4752 4898 | 
             
                      else
         | 
| @@ -4759,21 +4905,38 @@ module Polars | |
| 4759 4905 | 
             
                  # TODO zip_longest
         | 
| 4760 4906 | 
             
                  lookup = column_names.zip(lookup_names || []).to_h
         | 
| 4761 4907 |  | 
| 4762 | 
            -
                   | 
| 4763 | 
            -
                     | 
| 4764 | 
            -
                    (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
         | 
| 4908 | 
            +
                  column_dtypes =
         | 
| 4909 | 
            +
                    (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
         | 
| 4765 4910 | 
             
                      [lookup[col[0]] || col[0], col[1]]
         | 
| 4766 4911 | 
             
                    end
         | 
| 4767 | 
            -
             | 
| 4912 | 
            +
             | 
| 4913 | 
            +
                  if schema_overrides
         | 
| 4914 | 
            +
                    raise Todo
         | 
| 4915 | 
            +
                  end
         | 
| 4916 | 
            +
             | 
| 4917 | 
            +
                  column_dtypes.each do |col, dtype|
         | 
| 4918 | 
            +
                    if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
         | 
| 4919 | 
            +
                      column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
         | 
| 4920 | 
            +
                    end
         | 
| 4921 | 
            +
                  end
         | 
| 4922 | 
            +
             | 
| 4923 | 
            +
                  [column_names, column_dtypes]
         | 
| 4768 4924 | 
             
                end
         | 
| 4769 4925 |  | 
| 4770 | 
            -
                def self._handle_columns_arg(data, columns: nil)
         | 
| 4771 | 
            -
                  if columns.nil?
         | 
| 4926 | 
            +
                def self._handle_columns_arg(data, columns: nil, from_hash: false)
         | 
| 4927 | 
            +
                  if columns.nil? || columns.empty?
         | 
| 4772 4928 | 
             
                    data
         | 
| 4773 4929 | 
             
                  else
         | 
| 4774 4930 | 
             
                    if data.empty?
         | 
| 4775 4931 | 
             
                      columns.map { |c| Series.new(c, nil)._s }
         | 
| 4776 4932 | 
             
                    elsif data.length == columns.length
         | 
| 4933 | 
            +
                      if from_hash
         | 
| 4934 | 
            +
                        series_map = data.to_h { |s| [s.name, s] }
         | 
| 4935 | 
            +
                        if columns.all? { |col| series_map.key?(col) }
         | 
| 4936 | 
            +
                          return columns.map { |col| series_map[col] }
         | 
| 4937 | 
            +
                        end
         | 
| 4938 | 
            +
                      end
         | 
| 4939 | 
            +
             | 
| 4777 4940 | 
             
                      columns.each_with_index do |c, i|
         | 
| 4778 4941 | 
             
                        # not in-place?
         | 
| 4779 4942 | 
             
                        data[i].rename(c)
         | 
| @@ -4788,7 +4951,7 @@ module Polars | |
| 4788 4951 | 
             
                def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
         | 
| 4789 4952 | 
             
                  rbdf_columns = rbdf.columns
         | 
| 4790 4953 | 
             
                  rbdf_dtypes = rbdf.dtypes
         | 
| 4791 | 
            -
                  columns, dtypes =  | 
| 4954 | 
            +
                  columns, dtypes = _unpack_schema(
         | 
| 4792 4955 | 
             
                    (columns || rbdf_columns), schema_overrides: schema_overrides
         | 
| 4793 4956 | 
             
                  )
         | 
| 4794 4957 | 
             
                  column_subset = []
         | 
| @@ -4826,20 +4989,23 @@ module Polars | |
| 4826 4989 | 
             
                end
         | 
| 4827 4990 |  | 
| 4828 4991 | 
             
                # @private
         | 
| 4829 | 
            -
                def self.sequence_to_rbdf(data,  | 
| 4992 | 
            +
                def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
         | 
| 4993 | 
            +
                  raise Todo if schema_overrides
         | 
| 4994 | 
            +
                  columns = schema
         | 
| 4995 | 
            +
             | 
| 4830 4996 | 
             
                  if data.length == 0
         | 
| 4831 | 
            -
                    return hash_to_rbdf({},  | 
| 4997 | 
            +
                    return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
         | 
| 4832 4998 | 
             
                  end
         | 
| 4833 4999 |  | 
| 4834 5000 | 
             
                  if data[0].is_a?(Series)
         | 
| 4835 5001 | 
             
                    # series_names = data.map(&:name)
         | 
| 4836 | 
            -
                    # columns, dtypes =  | 
| 5002 | 
            +
                    # columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
         | 
| 4837 5003 | 
             
                    data_series = []
         | 
| 4838 5004 | 
             
                    data.each do |s|
         | 
| 4839 5005 | 
             
                      data_series << s._s
         | 
| 4840 5006 | 
             
                    end
         | 
| 4841 5007 | 
             
                  elsif data[0].is_a?(Hash)
         | 
| 4842 | 
            -
                    column_names, dtypes =  | 
| 5008 | 
            +
                    column_names, dtypes = _unpack_schema(columns)
         | 
| 4843 5009 | 
             
                    schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
         | 
| 4844 5010 | 
             
                    rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
         | 
| 4845 5011 | 
             
                    if column_names
         | 
| @@ -4865,11 +5031,21 @@ module Polars | |
| 4865 5031 | 
             
                end
         | 
| 4866 5032 |  | 
| 4867 5033 | 
             
                # @private
         | 
| 4868 | 
            -
                def self.series_to_rbdf(data,  | 
| 4869 | 
            -
                   | 
| 4870 | 
            -
             | 
| 5034 | 
            +
                def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
         | 
| 5035 | 
            +
                  data_series = [data._s]
         | 
| 5036 | 
            +
                  series_name = data_series.map(&:name)
         | 
| 5037 | 
            +
                  column_names, schema_overrides = _unpack_schema(
         | 
| 5038 | 
            +
                    schema || series_name, schema_overrides: schema_overrides, n_expected: 1
         | 
| 5039 | 
            +
                  )
         | 
| 5040 | 
            +
                  if schema_overrides.any?
         | 
| 5041 | 
            +
                    new_dtype = schema_overrides.values[0]
         | 
| 5042 | 
            +
                    if new_dtype != data.dtype
         | 
| 5043 | 
            +
                      data_series[0] = data_series[0].cast(new_dtype, true)
         | 
| 5044 | 
            +
                    end
         | 
| 4871 5045 | 
             
                  end
         | 
| 4872 | 
            -
             | 
| 5046 | 
            +
             | 
| 5047 | 
            +
                  data_series = _handle_columns_arg(data_series, columns: column_names)
         | 
| 5048 | 
            +
                  RbDataFrame.new(data_series)
         | 
| 4873 5049 | 
             
                end
         | 
| 4874 5050 |  | 
| 4875 5051 | 
             
                def wrap_ldf(ldf)
         | 
    
        data/lib/polars/data_types.rb
    CHANGED
    
    | @@ -84,20 +84,22 @@ module Polars | |
| 84 84 |  | 
| 85 85 | 
             
              # Calendar date and time type.
         | 
| 86 86 | 
             
              class Datetime < TemporalType
         | 
| 87 | 
            -
                attr_reader : | 
| 87 | 
            +
                attr_reader :time_unit, :time_zone
         | 
| 88 | 
            +
                alias_method :tu, :time_unit
         | 
| 88 89 |  | 
| 89 90 | 
             
                def initialize(time_unit = "us", time_zone = nil)
         | 
| 90 | 
            -
                  @ | 
| 91 | 
            +
                  @time_unit = time_unit || "us"
         | 
| 91 92 | 
             
                  @time_zone = time_zone
         | 
| 92 93 | 
             
                end
         | 
| 93 94 | 
             
              end
         | 
| 94 95 |  | 
| 95 96 | 
             
              # Time duration/delta type.
         | 
| 96 97 | 
             
              class Duration < TemporalType
         | 
| 97 | 
            -
                attr_reader : | 
| 98 | 
            +
                attr_reader :time_unit
         | 
| 99 | 
            +
                alias_method :tu, :time_unit
         | 
| 98 100 |  | 
| 99 101 | 
             
                def initialize(time_unit = "us")
         | 
| 100 | 
            -
                  @ | 
| 102 | 
            +
                  @time_unit = time_unit
         | 
| 101 103 | 
             
                end
         | 
| 102 104 | 
             
              end
         | 
| 103 105 |  |