polars-df 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -167,6 +167,10 @@ module Polars
  class Int64 < SignedIntegerType
  end

+ # 128-bit signed integer type.
+ class Int128 < SignedIntegerType
+ end
+
  # 8-bit unsigned integer type.
  class UInt8 < UnsignedIntegerType
  end
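
For context, the new `Polars::Int128` class can be used wherever a dtype is expected, for example in a cast. This is an illustrative sketch only, and assumes the compiled backend shipped with 0.17.0 accepts the dtype:

    require "polars"

    s = Polars::Series.new("a", [1, 2, 3])
    s.cast(Polars::Int128)   # assumption: the bundled engine supports 128-bit integers
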
@@ -311,7 +315,7 @@ module Polars
  end

  if categories.empty?
- self.categories = Series.new("category", [], dtype: String)
+ @categories = Series.new("category", [], dtype: String)
  return
  end

@@ -143,6 +143,9 @@ module Polars
  # @param exprs [Array]
  # Column(s) to use in the aggregation. Accepts expression input. Strings are
  # parsed as column names, other non-expression inputs are parsed as literals.
+ # @param ignore_nulls [Boolean]
+ # Ignore null values (default).
+ # If set to `false`, any null value in the input will lead to a null output.
  #
  # @return [Expr]
  #
@@ -166,9 +169,9 @@ module Polars
  # # │ 8 ┆ 5 ┆ y ┆ 13 │
  # # │ 3 ┆ null ┆ z ┆ 3 │
  # # └─────┴──────┴─────┴─────┘
- def sum_horizontal(*exprs)
+ def sum_horizontal(*exprs, ignore_nulls: true)
  rbexprs = Utils.parse_into_list_of_expressions(*exprs)
- Utils.wrap_expr(Plr.sum_horizontal(rbexprs))
+ Utils.wrap_expr(Plr.sum_horizontal(rbexprs, ignore_nulls))
  end

  # Compute the mean of all values horizontally across columns.
@@ -176,6 +179,9 @@ module Polars
  # @param exprs [Array]
  # Column(s) to use in the aggregation. Accepts expression input. Strings are
  # parsed as column names, other non-expression inputs are parsed as literals.
+ # @param ignore_nulls [Boolean]
+ # Ignore null values (default).
+ # If set to `false`, any null value in the input will lead to a null output.
  #
  # @return [Expr]
  #
@@ -199,9 +205,9 @@ module Polars
  # # │ 8 ┆ 5 ┆ y ┆ 6.5 │
  # # │ 3 ┆ null ┆ z ┆ 3.0 │
  # # └─────┴──────┴─────┴──────┘
- def mean_horizontal(*exprs)
+ def mean_horizontal(*exprs, ignore_nulls: true)
  rbexprs = Utils.parse_into_list_of_expressions(*exprs)
- Utils.wrap_expr(Plr.mean_horizontal(rbexprs))
+ Utils.wrap_expr(Plr.mean_horizontal(rbexprs, ignore_nulls))
  end

  # Cumulatively sum all values horizontally across columns.
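
A minimal sketch of the new `ignore_nulls` keyword on the horizontal aggregations; the frame and column names are illustrative, not from the diff:

    require "polars"

    df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, nil]})
    df.with_columns(
      Polars.sum_horizontal("a", "b").alias("sum"),                         # nulls skipped (default)
      Polars.sum_horizontal("a", "b", ignore_nulls: false).alias("strict")  # any null yields null
    )
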
@@ -729,16 +729,20 @@ module Polars
  a,
  b,
  method: "pearson",
- ddof: 1,
+ ddof: nil,
  propagate_nans: false
  )
+ if !ddof.nil?
+ warn "The `ddof` parameter has no effect. Do not use it."
+ end
+
  a = Utils.parse_into_expression(a)
  b = Utils.parse_into_expression(b)

  if method == "pearson"
- Utils.wrap_expr(Plr.pearson_corr(a, b, ddof))
+ Utils.wrap_expr(Plr.pearson_corr(a, b))
  elsif method == "spearman"
- Utils.wrap_expr(Plr.spearman_rank_corr(a, b, ddof, propagate_nans))
+ Utils.wrap_expr(Plr.spearman_rank_corr(a, b, propagate_nans))
  else
  msg = "method must be one of {{'pearson', 'spearman'}}, got #{method}"
  raise ArgumentError, msg
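
A short sketch of the updated `corr` behaviour (data values are illustrative): `ddof` is still accepted for backwards compatibility but only triggers a warning and no longer affects the result.

    df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
    df.select(Polars.corr("a", "b", method: "spearman"))
    df.select(Polars.corr("a", "b", ddof: 1))   # warns: the `ddof` parameter has no effect
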
@@ -0,0 +1,126 @@
+ module Polars
+ module IO
+ # Reads into a DataFrame from a Delta lake table.
+ #
+ # @param source [Object]
+ # DeltaTable or a Path or URI to the root of the Delta lake table.
+ # @param version [Object]
+ # Numerical version or timestamp version of the Delta lake table.
+ # @param columns [Array]
+ # Columns to select. Accepts a list of column names.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param storage_options [Hash]
+ # Extra options for the storage backends supported by `deltalake-rb`.
+ # @param delta_table_options [Hash]
+ # Additional keyword arguments while reading a Delta lake Table.
+ #
+ # @return [DataFrame]
+ def read_delta(
+ source,
+ version: nil,
+ columns: nil,
+ rechunk: false,
+ storage_options: nil,
+ delta_table_options: nil
+ )
+ dl_tbl =
+ _get_delta_lake_table(
+ source,
+ version: version,
+ storage_options: storage_options,
+ delta_table_options: delta_table_options
+ )
+
+ dl_tbl.to_polars(columns: columns, rechunk: rechunk)
+ end
+
+ # Lazily read from a Delta lake table.
+ #
+ # @param source [Object]
+ # DeltaTable or a Path or URI to the root of the Delta lake table.
+ # @param version [Object]
+ # Numerical version or timestamp version of the Delta lake table.
+ # @param storage_options [Hash]
+ # Extra options for the storage backends supported by `deltalake-rb`.
+ # @param delta_table_options [Hash]
+ # Additional keyword arguments while reading a Delta lake Table.
+ #
+ # @return [LazyFrame]
+ def scan_delta(
+ source,
+ version: nil,
+ storage_options: nil,
+ delta_table_options: nil
+ )
+ dl_tbl =
+ _get_delta_lake_table(
+ source,
+ version: version,
+ storage_options: storage_options,
+ delta_table_options: delta_table_options
+ )
+
+ dl_tbl.to_polars(eager: false)
+ end
+
+ private
+
+ def _resolve_delta_lake_uri(table_uri, strict: true)
+ require "uri"
+
+ parsed_result = URI(table_uri)
+
+ resolved_uri =
+ if parsed_result.scheme == ""
+ Utils.normalize_filepath(table_uri)
+ else
+ table_uri
+ end
+
+ resolved_uri
+ end
+
+ def _get_delta_lake_table(
+ table_path,
+ version: nil,
+ storage_options: nil,
+ delta_table_options: nil
+ )
+ _check_if_delta_available
+
+ if table_path.is_a?(DeltaLake::Table)
+ return table_path
+ end
+ delta_table_options ||= {}
+ resolved_uri = _resolve_delta_lake_uri(table_path)
+ if !version.is_a?(::String) && !version.is_a?(::Time)
+ dl_tbl =
+ DeltaLake::Table.new(
+ resolved_uri,
+ version: version,
+ storage_options: storage_options,
+ **delta_table_options
+ )
+ else
+ dl_tbl =
+ DeltaLake::Table.new(
+ resolved_uri,
+ storage_options: storage_options,
+ **delta_table_options
+ )
+ dl_tbl.load_as_version(version)
+ end
+
+ dl_tbl = DeltaLake::Table.new(table_path)
+ dl_tbl
+ end
+
+ def _check_if_delta_available
+ if !defined?(DeltaLake)
+ raise Error, "Delta Lake not available"
+ end
+ end
+ end
+ end
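
An illustrative sketch of the two new entry points. The table path is hypothetical, and the `deltalake-rb` gem must be loaded, otherwise the "Delta Lake not available" error above is raised:

    require "deltalake"   # from the deltalake-rb gem (assumed require path)
    require "polars"

    df = Polars.read_delta("./delta/events")               # eager DataFrame
    lf = Polars.scan_delta("./delta/events", version: 3)   # LazyFrame at a specific table version
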
@@ -431,7 +431,9 @@ module Polars
  projection_pushdown: true,
  simplify_expression: true,
  no_optimization: false,
- slice_pushdown: true
+ slice_pushdown: true,
+ storage_options: nil,
+ retries: 2
  )
  lf = _set_sink_optimizations(
  type_coercion: type_coercion,
@@ -460,6 +462,12 @@ module Polars
  }
  end

+ if storage_options&.any?
+ storage_options = storage_options.to_a
+ else
+ storage_options = nil
+ end
+
  lf.sink_parquet(
  path,
  compression,
@@ -467,7 +475,9 @@ module Polars
  statistics,
  row_group_size,
  data_pagesize_limit,
- maintain_order
+ maintain_order,
+ storage_options,
+ retries
  )
  end

@@ -512,6 +522,10 @@ module Polars
  slice_pushdown: true,
  no_optimization: false
  )
+ # TODO support storage options in Rust
+ storage_options = nil
+ retries = 2
+
  lf = _set_sink_optimizations(
  type_coercion: type_coercion,
  predicate_pushdown: predicate_pushdown,
@@ -521,10 +535,18 @@ module Polars
  no_optimization: no_optimization
  )

+ if storage_options&.any?
+ storage_options = storage_options.to_a
+ else
+ storage_options = nil
+ end
+
  lf.sink_ipc(
  path,
  compression,
- maintain_order
+ maintain_order,
+ storage_options,
+ retries
  )
  end

@@ -692,7 +714,9 @@ module Polars
  projection_pushdown: true,
  simplify_expression: true,
  slice_pushdown: true,
- no_optimization: false
+ no_optimization: false,
+ storage_options: nil,
+ retries: 2
  )
  lf = _set_sink_optimizations(
  type_coercion: type_coercion,
@@ -703,7 +727,13 @@ module Polars
  no_optimization: no_optimization
  )

- lf.sink_json(path, maintain_order)
+ if storage_options&.any?
+ storage_options = storage_options.to_a
+ else
+ storage_options = nil
+ end
+
+ lf.sink_json(path, maintain_order, storage_options, retries)
  end

  # @private
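
A brief sketch of `sink_parquet` with the new keywords. The S3 URI and storage options are placeholders, and note that `sink_ipc` currently discards them (see the TODO above):

    lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy

    lf.sink_parquet(
      "s3://my-bucket/out.parquet",                     # hypothetical destination
      storage_options: {"aws_region" => "us-east-1"},   # hypothetical object-store options
      retries: 3
    )
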
@@ -1586,6 +1616,14 @@ module Polars
  # - true: -> Always coalesce join columns.
  # - false: -> Never coalesce join columns.
  # Note that joining on any other expressions than `col` will turn off coalescing.
+ # @param allow_exact_matches [Boolean]
+ # Whether exact matches are valid join predicates.
+ # - If true, allow matching with the same `on` value (i.e. less-than-or-equal-to / greater-than-or-equal-to).
+ # - If false, don't match the same `on` value (i.e., strictly less-than / strictly greater-than).
+ # @param check_sortedness [Boolean]
+ # Check the sortedness of the asof keys. If the keys are not sorted Polars
+ # will error, or in case of 'by' argument raise a warning. This might become
+ # a hard error in the future.
  #
  # @return [LazyFrame]
  #
@@ -1785,7 +1823,9 @@ module Polars
  tolerance: nil,
  allow_parallel: true,
  force_parallel: false,
- coalesce: true
+ coalesce: true,
+ allow_exact_matches: true,
+ check_sortedness: true
  )
  if !other.is_a?(LazyFrame)
  raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
@@ -1841,7 +1881,9 @@ module Polars
  strategy,
  tolerance_num,
  tolerance_str,
- coalesce
+ coalesce,
+ allow_exact_matches,
+ check_sortedness
  )
  )
  end
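
A small sketch of the new asof-join keywords; the frames and values are illustrative:

    quotes = Polars::DataFrame.new({"time" => [1, 2, 3], "quote" => [100, 101, 102]}).lazy
    trades = Polars::DataFrame.new({"time" => [2, 3, 5], "qty" => [9, 8, 7]}).lazy

    trades.join_asof(
      quotes,
      on: "time",
      strategy: "backward",
      allow_exact_matches: false,   # only match strictly earlier quotes
      check_sortedness: true        # raise if the "time" keys are not sorted
    ).collect
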
@@ -372,9 +372,91 @@ module Polars
  # def by_index
  # end

- # TODO
- # def by_name
- # end
+ # Select all columns matching the given names.
+ #
+ # @param names [Array]
+ # One or more names of columns to select.
+ # @param require_all [Boolean]
+ # Whether to match *all* names (the default) or *any* of the names.
+ #
+ # @return [SelectorProxy]
+ #
+ # @note
+ # Matching columns are returned in the order in which they are declared in
+ # the selector, not the underlying schema order.
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => ["x", "y"],
+ # "bar" => [123, 456],
+ # "baz" => [2.0, 5.5],
+ # "zap" => [false, true]
+ # }
+ # )
+ #
+ # @example Select columns by name:
+ # df.select(Polars.cs.by_name("foo", "bar"))
+ # # =>
+ # # shape: (2, 2)
+ # # ┌─────┬─────┐
+ # # │ foo ┆ bar │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ x ┆ 123 │
+ # # │ y ┆ 456 │
+ # # └─────┴─────┘
+ #
+ # @example Match *any* of the given columns by name:
+ # df.select(Polars.cs.by_name("baz", "moose", "foo", "bear", require_all: false))
+ # # =>
+ # # shape: (2, 2)
+ # # ┌─────┬─────┐
+ # # │ foo ┆ baz │
+ # # │ --- ┆ --- │
+ # # │ str ┆ f64 │
+ # # ╞═════╪═════╡
+ # # │ x ┆ 2.0 │
+ # # │ y ┆ 5.5 │
+ # # └─────┴─────┘
+ #
+ # @example Match all columns *except* for those given:
+ # df.select(~Polars.cs.by_name("foo", "bar"))
+ # # =>
+ # # shape: (2, 2)
+ # # ┌─────┬───────┐
+ # # │ baz ┆ zap │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ bool │
+ # # ╞═════╪═══════╡
+ # # │ 2.0 ┆ false │
+ # # │ 5.5 ┆ true │
+ # # └─────┴───────┘
+ def self.by_name(*names, require_all: true)
+ all_names = []
+ names.each do |nm|
+ if nm.is_a?(::String)
+ all_names << nm
+ else
+ msg = "invalid name: #{nm.inspect}"
+ raise TypeError, msg
+ end
+ end
+
+ selector_params = {"*names" => all_names}
+ match_cols = all_names
+ if !require_all
+ match_cols = "^(#{all_names.map { |nm| Utils.re_escape(nm) }.join("|")})$"
+ selector_params["require_all"] = require_all
+ end
+
+ _selector_proxy_(
+ F.col(match_cols),
+ name: "by_name",
+ parameters: selector_params
+ )
+ end

  # Select all categorical columns.
  #
data/lib/polars/series.rb CHANGED
@@ -4696,7 +4696,12 @@ module Polars
  end

  constructor = polars_type_to_constructor(dtype)
- rbseries = constructor.call(name, values, strict)
+ rbseries =
+ if dtype == Array
+ constructor.call(name, values, strict)
+ else
+ construct_series_with_fallbacks(constructor, name, values, dtype, strict: strict)
+ end

  base_type = dtype.is_a?(DataType) ? dtype.class : dtype
  if [Date, Datetime, Duration, Time, Categorical, Boolean, Enum, Decimal].include?(base_type)
@@ -1,4 +1,4 @@
  module Polars
  # @private
- VERSION = "0.15.0"
+ VERSION = "0.17.0"
  end
data/lib/polars.rb CHANGED
@@ -49,6 +49,7 @@ require_relative "polars/group_by"
  require_relative "polars/io/avro"
  require_relative "polars/io/csv"
  require_relative "polars/io/database"
+ require_relative "polars/io/delta"
  require_relative "polars/io/ipc"
  require_relative "polars/io/json"
  require_relative "polars/io/ndjson"
@@ -89,4 +90,18 @@ module Polars

  # @private
  N_INFER_DEFAULT = 100
+
+ # @private
+ class ArrowArrayStream
+ def arrow_c_stream
+ self
+ end
+ end
+
+ # Return the number of threads in the Polars thread pool.
+ #
+ # @return [Integer]
+ def self.thread_pool_size
+ Plr.thread_pool_size
+ end
  end
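
The new helper simply reports the size of the underlying thread pool; the return value below is only an example and depends on the machine:

    Polars.thread_pool_size   # => 8
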
metadata CHANGED
@@ -1,14 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: polars-df
  version: !ruby/object:Gem::Version
- version: 0.15.0
+ version: 0.17.0
  platform: ruby
  authors:
  - Andrew Kane
- autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-11-20 00:00:00.000000000 Z
+ date: 2025-01-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bigdecimal
@@ -38,7 +37,6 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
- description:
  email: andrew@ankane.org
  executables: []
  extensions:
@@ -160,6 +158,7 @@ files:
  - lib/polars/io/avro.rb
  - lib/polars/io/csv.rb
  - lib/polars/io/database.rb
+ - lib/polars/io/delta.rb
  - lib/polars/io/ipc.rb
  - lib/polars/io/json.rb
  - lib/polars/io/ndjson.rb
@@ -194,7 +193,6 @@ homepage: https://github.com/ankane/ruby-polars
  licenses:
  - MIT
  metadata: {}
- post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -202,15 +200,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: '3.1'
+ version: '3.2'
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.22
- signing_key:
+ rubygems_version: 3.6.2
  specification_version: 4
  summary: Blazingly fast DataFrames for Ruby
  test_files: []