polars-df 0.16.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -495,6 +495,45 @@ module Polars
495
495
  _df.arrow_c_stream
496
496
  end
497
497
 
498
+ # Get an ordered mapping of column names to their data type.
499
+ #
500
+ # @return [Schema]
501
+ #
502
+ # @note
503
+ # This method is included to facilitate writing code that is generic for both
504
+ # DataFrame and LazyFrame.
505
+ #
506
+ # @example Determine the schema.
507
+ # df = Polars::DataFrame.new(
508
+ # {
509
+ # "foo" => [1, 2, 3],
510
+ # "bar" => [6.0, 7.0, 8.0],
511
+ # "ham" => ["a", "b", "c"]
512
+ # }
513
+ # )
514
+ # df.collect_schema
515
+ # # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
516
+ #
517
+ # @example Access various properties of the schema using the `Schema` object.
518
+ # schema = df.collect_schema
519
+ # schema["bar"]
520
+ # # => Polars::Float64
521
+ #
522
+ # @example
523
+ # schema.names
524
+ # # => ["foo", "bar", "ham"]
525
+ #
526
+ # @example
527
+ # schema.dtypes
528
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
529
+ #
530
+ # @example
531
+ # schema.length
532
+ # # => 3
533
+ def collect_schema
534
+ Schema.new(columns.zip(dtypes), check_dtypes: false)
535
+ end
536
+
498
537
  # Return the dataframe as a scalar.
499
538
  #
500
539
  # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -604,10 +643,6 @@ module Polars
604
643
  #
605
644
  # @param file [String]
606
645
  # File path to which the result should be written.
607
- # @param pretty [Boolean]
608
- # Pretty serialize json.
609
- # @param row_oriented [Boolean]
610
- # Write to row oriented json. This is slower, but more common.
611
646
  #
612
647
  # @return [nil]
613
648
  #
@@ -619,16 +654,8 @@ module Polars
619
654
  # }
620
655
  # )
621
656
  # df.write_json
622
- # # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
623
- #
624
- # @example
625
- # df.write_json(row_oriented: true)
626
657
  # # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
627
- def write_json(
628
- file = nil,
629
- pretty: false,
630
- row_oriented: false
631
- )
658
+ def write_json(file = nil)
632
659
  if Utils.pathlike?(file)
633
660
  file = Utils.normalize_filepath(file)
634
661
  end
@@ -636,7 +663,7 @@ module Polars
636
663
  if file.nil? || to_string_io
637
664
  buf = StringIO.new
638
665
  buf.set_encoding(Encoding::BINARY)
639
- _df.write_json(buf, pretty, row_oriented)
666
+ _df.write_json(buf)
640
667
  json_bytes = buf.string
641
668
 
642
669
  json_str = json_bytes.force_encoding(Encoding::UTF_8)
@@ -646,7 +673,7 @@ module Polars
646
673
  return json_str
647
674
  end
648
675
  else
649
- _df.write_json(file, pretty, row_oriented)
676
+ _df.write_json(file)
650
677
  end
651
678
  nil
652
679
  end
@@ -973,6 +1000,139 @@ module Polars
973
1000
  )
974
1001
  end
975
1002
 
1003
+ # Write the data in a Polars DataFrame to a database.
1004
+ #
1005
+ # @param table_name [String]
1006
+ # Schema-qualified name of the table to create or append to in the target
1007
+ # SQL database.
1008
+ # @param connection [Object]
1009
+ # An existing Active Record connection against the target database.
1010
+ # @param if_table_exists ['append', 'replace', 'fail']
1011
+ # The insert mode:
1012
+ #
1013
+ # * 'replace' will create a new database table, overwriting an existing one.
1014
+ # * 'append' will append to an existing table.
1015
+ # * 'fail' will fail if table already exists.
1016
+ #
1017
+ # @return [Integer]
1018
+ #
1019
+ # @note
1020
+ # This functionality is experimental. It may be changed at any point without it being considered a breaking change.
1021
+ def write_database(table_name, connection = nil, if_table_exists: "fail")
1022
+ if !defined?(ActiveRecord)
1023
+ raise Error, "Active Record not available"
1024
+ elsif ActiveRecord::VERSION::MAJOR < 7
1025
+ raise Error, "Requires Active Record 7+"
1026
+ end
1027
+
1028
+ valid_write_modes = ["append", "replace", "fail"]
1029
+ if !valid_write_modes.include?(if_table_exists)
1030
+ msg = "write_database `if_table_exists` must be one of #{valid_write_modes.inspect}, got #{if_table_exists.inspect}"
1031
+ raise ArgumentError, msg
1032
+ end
1033
+
1034
+ with_connection(connection) do |connection|
1035
+ table_exists = connection.table_exists?(table_name)
1036
+ if table_exists && if_table_exists == "fail"
1037
+ raise ArgumentError, "Table already exists"
1038
+ end
1039
+
1040
+ create_table = !table_exists || if_table_exists == "replace"
1041
+ maybe_transaction(connection, create_table) do
1042
+ if create_table
1043
+ mysql = connection.adapter_name.match?(/mysql|trilogy/i)
1044
+ force = if_table_exists == "replace"
1045
+ connection.create_table(table_name, id: false, force: force) do |t|
1046
+ schema.each do |c, dtype|
1047
+ options = {}
1048
+ column_type =
1049
+ case dtype
1050
+ when Binary
1051
+ :binary
1052
+ when Boolean
1053
+ :boolean
1054
+ when Date
1055
+ :date
1056
+ when Datetime
1057
+ :datetime
1058
+ when Decimal
1059
+ if mysql
1060
+ options[:precision] = dtype.precision || 65
1061
+ options[:scale] = dtype.scale || 30
1062
+ end
1063
+ :decimal
1064
+ when Float32
1065
+ options[:limit] = 24
1066
+ :float
1067
+ when Float64
1068
+ options[:limit] = 53
1069
+ :float
1070
+ when Int8
1071
+ options[:limit] = 1
1072
+ :integer
1073
+ when Int16
1074
+ options[:limit] = 2
1075
+ :integer
1076
+ when Int32
1077
+ options[:limit] = 4
1078
+ :integer
1079
+ when Int64
1080
+ options[:limit] = 8
1081
+ :integer
1082
+ when UInt8
1083
+ if mysql
1084
+ options[:limit] = 1
1085
+ options[:unsigned] = true
1086
+ else
1087
+ options[:limit] = 2
1088
+ end
1089
+ :integer
1090
+ when UInt16
1091
+ if mysql
1092
+ options[:limit] = 2
1093
+ options[:unsigned] = true
1094
+ else
1095
+ options[:limit] = 4
1096
+ end
1097
+ :integer
1098
+ when UInt32
1099
+ if mysql
1100
+ options[:limit] = 4
1101
+ options[:unsigned] = true
1102
+ else
1103
+ options[:limit] = 8
1104
+ end
1105
+ :integer
1106
+ when UInt64
1107
+ if mysql
1108
+ options[:limit] = 8
1109
+ options[:unsigned] = true
1110
+ :integer
1111
+ else
1112
+ options[:precision] = 20
1113
+ options[:scale] = 0
1114
+ :decimal
1115
+ end
1116
+ when String
1117
+ :text
1118
+ when Time
1119
+ :time
1120
+ else
1121
+ raise ArgumentError, "column type not supported yet: #{dtype}"
1122
+ end
1123
+ t.column c, column_type, **options
1124
+ end
1125
+ end
1126
+ end
1127
+
1128
+ quoted_table = connection.quote_table_name(table_name)
1129
+ quoted_columns = columns.map { |c| connection.quote_column_name(c) }
1130
+ rows = cast({Polars::UInt64 => Polars::String}).rows(named: false).map { |row| "(#{row.map { |v| connection.quote(v) }.join(", ")})" }
1131
+ connection.exec_update("INSERT INTO #{quoted_table} (#{quoted_columns.join(", ")}) VALUES #{rows.join(", ")}")
1132
+ end
1133
+ end
1134
+ end
1135
+
976
1136
  # Write DataFrame as delta table.
977
1137
  #
978
1138
  # @param target [Object]
@@ -2294,6 +2454,14 @@ module Polars
2294
2454
  # keys are within this distance. If an asof join is done on columns of dtype
2295
2455
  # "Date", "Datetime", "Duration" or "Time", you can use the following string
2296
2456
  # language:
2457
+ # @param allow_exact_matches [Boolean]
2458
+ # Whether exact matches are valid join predicates.
2459
+ # - If true, allow matching with the same `on` value (i.e. less-than-or-equal-to / greater-than-or-equal-to).
2460
+ # - If false, don't match the same `on` value (i.e., strictly less-than / strictly greater-than).
2461
+ # @param check_sortedness [Boolean]
2462
+ # Check the sortedness of the asof keys. If the keys are not sorted Polars
2463
+ # will error or, if the 'by' argument is given, raise a warning. This might become
2464
+ # a hard error in the future.
2297
2465
  #
2298
2466
  # - 1ns (1 nanosecond)
2299
2467
  # - 1us (1 microsecond)
@@ -2375,7 +2543,9 @@ module Polars
2375
2543
  tolerance: nil,
2376
2544
  allow_parallel: true,
2377
2545
  force_parallel: false,
2378
- coalesce: true
2546
+ coalesce: true,
2547
+ allow_exact_matches: true,
2548
+ check_sortedness: true
2379
2549
  )
2380
2550
  lazy
2381
2551
  .join_asof(
@@ -2391,7 +2561,9 @@ module Polars
2391
2561
  tolerance: tolerance,
2392
2562
  allow_parallel: allow_parallel,
2393
2563
  force_parallel: force_parallel,
2394
- coalesce: coalesce
2564
+ coalesce: coalesce,
2565
+ allow_exact_matches: allow_exact_matches,
2566
+ check_sortedness: check_sortedness
2395
2567
  )
2396
2568
  .collect(no_optimization: true)
2397
2569
  end
@@ -2424,6 +2596,24 @@ module Polars
2424
2596
  # - true: -> Always coalesce join columns.
2425
2597
  # - false: -> Never coalesce join columns.
2426
2598
  # Note that joining on any other expressions than `col` will turn off coalescing.
2599
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
2600
+ # Which DataFrame row order to preserve, if any.
2601
+ # Do not rely on any observed ordering without explicitly
2602
+ # setting this parameter, as your code may break in a future release.
2603
+ # Not specifying any ordering can improve performance.
2604
+ # Supported for inner, left, right and full joins.
2605
+ #
2606
+ # * *none*
2607
+ # No specific ordering is desired. The ordering might differ across
2608
+ # Polars versions or even between different runs.
2609
+ # * *left*
2610
+ # Preserves the order of the left DataFrame.
2611
+ # * *right*
2612
+ # Preserves the order of the right DataFrame.
2613
+ # * *left_right*
2614
+ # First preserves the order of the left DataFrame, then the right.
2615
+ # * *right_left*
2616
+ # First preserves the order of the right DataFrame, then the left.
2427
2617
  #
2428
2618
  # @return [DataFrame]
2429
2619
  #
@@ -2506,7 +2696,8 @@ module Polars
2506
2696
  # # ╞═════╪═════╪═════╡
2507
2697
  # # │ 3 ┆ 8.0 ┆ c │
2508
2698
  # # └─────┴─────┴─────┘
2509
- def join(other,
2699
+ def join(
2700
+ other,
2510
2701
  left_on: nil,
2511
2702
  right_on: nil,
2512
2703
  on: nil,
@@ -2514,7 +2705,8 @@ module Polars
2514
2705
  suffix: "_right",
2515
2706
  validate: "m:m",
2516
2707
  join_nulls: false,
2517
- coalesce: nil
2708
+ coalesce: nil,
2709
+ maintain_order: nil
2518
2710
  )
2519
2711
  lazy
2520
2712
  .join(
@@ -2526,7 +2718,8 @@ module Polars
2526
2718
  suffix: suffix,
2527
2719
  validate: validate,
2528
2720
  join_nulls: join_nulls,
2529
- coalesce: coalesce
2721
+ coalesce: coalesce,
2722
+ maintain_order: maintain_order
2530
2723
  )
2531
2724
  .collect(no_optimization: true)
2532
2725
  end
@@ -4865,6 +5058,90 @@ module Polars
4865
5058
  iter_rows(named: named, buffer_size: buffer_size, &block)
4866
5059
  end
4867
5060
 
5061
+ # Returns an iterator over the columns of this DataFrame.
5062
+ #
5063
+ # @return [Object]
5064
+ #
5065
+ # @note
5066
+ # Consider whether you can use `all` instead.
5067
+ # If you can, it will be more efficient.
5068
+ #
5069
+ # @example
5070
+ # df = Polars::DataFrame.new(
5071
+ # {
5072
+ # "a" => [1, 3, 5],
5073
+ # "b" => [2, 4, 6]
5074
+ # }
5075
+ # )
5076
+ # df.iter_columns.map { |s| s.name }
5077
+ # # => ["a", "b"]
5078
+ #
5079
+ # @example If you're using this to modify a dataframe's columns, e.g.
5080
+ # # Do NOT do this
5081
+ # Polars::DataFrame.new(df.iter_columns.map { |column| column * 2 })
5082
+ # # =>
5083
+ # # shape: (3, 2)
5084
+ # # ┌─────┬─────┐
5085
+ # # │ a ┆ b │
5086
+ # # │ --- ┆ --- │
5087
+ # # │ i64 ┆ i64 │
5088
+ # # ╞═════╪═════╡
5089
+ # # │ 2 ┆ 4 │
5090
+ # # │ 6 ┆ 8 │
5091
+ # # │ 10 ┆ 12 │
5092
+ # # └─────┴─────┘
5093
+ #
5094
+ # @example then consider whether you can use `all` instead:
5095
+ # df.select(Polars.all * 2)
5096
+ # # =>
5097
+ # # shape: (3, 2)
5098
+ # # ┌─────┬─────┐
5099
+ # # │ a ┆ b │
5100
+ # # │ --- ┆ --- │
5101
+ # # │ i64 ┆ i64 │
5102
+ # # ╞═════╪═════╡
5103
+ # # │ 2 ┆ 4 │
5104
+ # # │ 6 ┆ 8 │
5105
+ # # │ 10 ┆ 12 │
5106
+ # # └─────┴─────┘
5107
+ def iter_columns
5108
+ return to_enum(:iter_columns) unless block_given?
5109
+
5110
+ _df.get_columns.each do |s|
5111
+ yield Utils.wrap_s(s)
5112
+ end
5113
+ end
5114
+
5115
+ # Returns a non-copying iterator of slices over the underlying DataFrame.
5116
+ #
5117
+ # @param n_rows [Integer]
5118
+ # Determines the number of rows contained in each DataFrame slice.
5119
+ #
5120
+ # @return [Object]
5121
+ #
5122
+ # @example
5123
+ # df = Polars::DataFrame.new(
5124
+ # {
5125
+ # "a" => 0...17_500,
5126
+ # "b" => Date.new(2023, 1, 1),
5127
+ # "c" => "klmnoopqrstuvwxyz"
5128
+ # },
5129
+ # schema_overrides: {"a" => Polars::Int32}
5130
+ # )
5131
+ # df.iter_slices.map.with_index do |frame, idx|
5132
+ # "#{frame.class.name}:[#{idx}]:#{frame.length}"
5133
+ # end
5134
+ # # => ["Polars::DataFrame:[0]:10000", "Polars::DataFrame:[1]:7500"]
5135
+ def iter_slices(n_rows: 10_000)
5136
+ return to_enum(:iter_slices, n_rows: n_rows) unless block_given?
5137
+
5138
+ offset = 0
5139
+ while offset < height
5140
+ yield slice(offset, n_rows)
5141
+ offset += n_rows
5142
+ end
5143
+ end
5144
+
4868
5145
  # Shrink DataFrame memory usage.
4869
5146
  #
4870
5147
  # Shrinks to fit the exact capacity needed to hold the data.
@@ -5101,12 +5378,17 @@ module Polars
5101
5378
  lazy.merge_sorted(other.lazy, key).collect(_eager: true)
5102
5379
  end
5103
5380
 
5104
- # Indicate that one or multiple columns are sorted.
5381
+ # Flag a column as sorted.
5382
+ #
5383
+ # This can speed up future operations.
5384
+ #
5385
+ # @note
5386
+ # This can lead to incorrect results if the data is NOT sorted! Use with care!
5105
5387
  #
5106
5388
  # @param column [Object]
5107
- # Columns that are sorted
5389
+ # Column that is sorted.
5108
5390
  # @param descending [Boolean]
5109
- # Whether the columns are sorted in descending order.
5391
+ # Whether the column is sorted in descending order.
5110
5392
  #
5111
5393
  # @return [DataFrame]
5112
5394
  def set_sorted(
@@ -5538,5 +5820,21 @@ module Polars
5538
5820
  end
5539
5821
  other
5540
5822
  end
5823
+
5824
+ def with_connection(connection, &block)
5825
+ if !connection.nil?
5826
+ yield connection
5827
+ else
5828
+ ActiveRecord::Base.connection_pool.with_connection(&block)
5829
+ end
5830
+ end
5831
+
5832
+ def maybe_transaction(connection, create_table, &block)
5833
+ if create_table && connection.adapter_name.match?(/postg|sqlite/i) && connection.open_transactions == 0
5834
+ connection.transaction(&block)
5835
+ else
5836
+ yield
5837
+ end
5838
+ end
5541
5839
  end
5542
5840
  end
@@ -167,6 +167,10 @@ module Polars
167
167
  class Int64 < SignedIntegerType
168
168
  end
169
169
 
170
+ # 128-bit signed integer type.
171
+ class Int128 < SignedIntegerType
172
+ end
173
+
170
174
  # 8-bit unsigned integer type.
171
175
  class UInt8 < UnsignedIntegerType
172
176
  end
data/lib/polars/expr.rb CHANGED
@@ -3994,6 +3994,37 @@ module Polars
3994
3994
  _from_rbexpr(_rbexpr.interpolate(method))
3995
3995
  end
3996
3996
 
3997
+ # Fill null values using interpolation based on another column.
3998
+ #
3999
+ # @param by [Expr] Column to interpolate values based on.
4000
+ #
4001
+ # @return [Expr]
4002
+ #
4003
+ # @example Fill null values using linear interpolation.
4004
+ # df = Polars::DataFrame.new(
4005
+ # {
4006
+ # "a" => [1, nil, nil, 3],
4007
+ # "b" => [1, 2, 7, 8]
4008
+ # }
4009
+ # )
4010
+ # df.with_columns(a_interpolated: Polars.col("a").interpolate_by("b"))
4011
+ # # =>
4012
+ # # shape: (4, 3)
4013
+ # # ┌──────┬─────┬────────────────┐
4014
+ # # │ a ┆ b ┆ a_interpolated │
4015
+ # # │ --- ┆ --- ┆ --- │
4016
+ # # │ i64 ┆ i64 ┆ f64 │
4017
+ # # ╞══════╪═════╪════════════════╡
4018
+ # # │ 1 ┆ 1 ┆ 1.0 │
4019
+ # # │ null ┆ 2 ┆ 1.285714 │
4020
+ # # │ null ┆ 7 ┆ 2.714286 │
4021
+ # # │ 3 ┆ 8 ┆ 3.0 │
4022
+ # # └──────┴─────┴────────────────┘
4023
+ def interpolate_by(by)
4024
+ by = Utils.parse_into_expression(by)
4025
+ _from_rbexpr(_rbexpr.interpolate_by(by))
4026
+ end
4027
+
3997
4028
  # Apply a rolling min based on another column.
3998
4029
  #
3999
4030
  # @param by [String]