polars-df 0.21.0-x86_64-linux-musl → 0.22.0-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +55 -48
  4. data/Cargo.toml +3 -0
  5. data/LICENSE-THIRD-PARTY.txt +23 -49
  6. data/README.md +12 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/3.4/polars.so +0 -0
  10. data/lib/polars/array_expr.rb +382 -3
  11. data/lib/polars/array_name_space.rb +281 -0
  12. data/lib/polars/binary_expr.rb +67 -0
  13. data/lib/polars/binary_name_space.rb +43 -0
  14. data/lib/polars/cat_expr.rb +224 -0
  15. data/lib/polars/cat_name_space.rb +138 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/convert.rb +6 -6
  18. data/lib/polars/data_frame.rb +794 -27
  19. data/lib/polars/data_type_expr.rb +52 -0
  20. data/lib/polars/data_types.rb +26 -5
  21. data/lib/polars/date_time_expr.rb +252 -1
  22. data/lib/polars/date_time_name_space.rb +299 -0
  23. data/lib/polars/expr.rb +1248 -206
  24. data/lib/polars/functions/business.rb +95 -0
  25. data/lib/polars/functions/datatype.rb +21 -0
  26. data/lib/polars/functions/lazy.rb +14 -1
  27. data/lib/polars/io/csv.rb +1 -1
  28. data/lib/polars/io/iceberg.rb +27 -0
  29. data/lib/polars/io/json.rb +4 -4
  30. data/lib/polars/io/ndjson.rb +4 -4
  31. data/lib/polars/io/parquet.rb +32 -7
  32. data/lib/polars/io/scan_options.rb +4 -1
  33. data/lib/polars/lazy_frame.rb +1028 -28
  34. data/lib/polars/list_expr.rb +217 -17
  35. data/lib/polars/list_name_space.rb +231 -22
  36. data/lib/polars/meta_expr.rb +89 -0
  37. data/lib/polars/name_expr.rb +36 -0
  38. data/lib/polars/query_opt_flags.rb +50 -0
  39. data/lib/polars/scan_cast_options.rb +20 -1
  40. data/lib/polars/schema.rb +79 -3
  41. data/lib/polars/selector.rb +72 -0
  42. data/lib/polars/selectors.rb +3 -3
  43. data/lib/polars/series.rb +1053 -54
  44. data/lib/polars/string_expr.rb +436 -32
  45. data/lib/polars/string_name_space.rb +736 -50
  46. data/lib/polars/struct_expr.rb +103 -0
  47. data/lib/polars/struct_name_space.rb +19 -1
  48. data/lib/polars/utils/serde.rb +17 -0
  49. data/lib/polars/utils/various.rb +22 -1
  50. data/lib/polars/utils.rb +5 -1
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +6 -0
  53. metadata +8 -2
data/lib/polars/functions/business.rb
@@ -0,0 +1,95 @@
+ module Polars
+ module Functions
+ # Count the number of business days between `start` and `end` (not including `end`).
+ #
+ # @note
+ # This functionality is considered **unstable**. It may be changed
+ # at any point without it being considered a breaking change.
+ #
+ # @param start [Object]
+ # Start dates.
+ # @param stop [Object]
+ # End dates.
+ # @param week_mask [Array]
+ # Which days of the week to count. The default is Monday to Friday.
+ # If you wanted to count only Monday to Thursday, you would pass
+ # `[true, true, true, true, false, false, false]`.
+ # @param holidays [Array]
+ # Holidays to exclude from the count.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "start" => [Date.new(2020, 1, 1), Date.new(2020, 1, 2)],
+ # "end" => [Date.new(2020, 1, 2), Date.new(2020, 1, 10)]
+ # }
+ # )
+ # df.with_columns(
+ # business_day_count: Polars.business_day_count("start", "end")
+ # )
+ # # =>
+ # # shape: (2, 3)
+ # # ┌────────────┬────────────┬────────────────────┐
+ # # │ start ┆ end ┆ business_day_count │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ date ┆ date ┆ i32 │
+ # # ╞════════════╪════════════╪════════════════════╡
+ # # │ 2020-01-01 ┆ 2020-01-02 ┆ 1 │
+ # # │ 2020-01-02 ┆ 2020-01-10 ┆ 6 │
+ # # └────────────┴────────────┴────────────────────┘
+ #
+ # @example You can pass a custom weekend - for example, if you only take Sunday off:
+ # week_mask = [true, true, true, true, true, true, false]
+ # df.with_columns(
+ # business_day_count: Polars.business_day_count(
+ # "start", "end", week_mask: week_mask
+ # )
+ # )
+ # # =>
+ # # shape: (2, 3)
+ # # ┌────────────┬────────────┬────────────────────┐
+ # # │ start ┆ end ┆ business_day_count │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ date ┆ date ┆ i32 │
+ # # ╞════════════╪════════════╪════════════════════╡
+ # # │ 2020-01-01 ┆ 2020-01-02 ┆ 1 │
+ # # │ 2020-01-02 ┆ 2020-01-10 ┆ 7 │
+ # # └────────────┴────────────┴────────────────────┘
+ #
+ # @example You can also pass a list of holidays to exclude from the count:
+ # holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]
+ # df.with_columns(
+ # business_day_count: Polars.business_day_count("start", "end", holidays: holidays)
+ # )
+ # # =>
+ # # shape: (2, 3)
+ # # ┌────────────┬────────────┬────────────────────┐
+ # # │ start ┆ end ┆ business_day_count │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ date ┆ date ┆ i32 │
+ # # ╞════════════╪════════════╪════════════════════╡
+ # # │ 2020-01-01 ┆ 2020-01-02 ┆ 0 │
+ # # │ 2020-01-02 ┆ 2020-01-10 ┆ 5 │
+ # # └────────────┴────────────┴────────────────────┘
+ def business_day_count(
+ start,
+ stop,
+ week_mask: [true, true, true, true, true, false, false],
+ holidays: []
+ )
+ start_rbexpr = Utils.parse_into_expression(start)
+ end_rbexpr = Utils.parse_into_expression(stop)
+ unix_epoch = ::Date.new(1970, 1, 1)
+ Utils.wrap_expr(
+ Plr.business_day_count(
+ start_rbexpr,
+ end_rbexpr,
+ week_mask,
+ holidays.map { |holiday| holiday - unix_epoch }
+ )
+ )
+ end
+ end
+ end
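A minimal usage sketch of the new `Polars.business_day_count` (the frame, dates, and output column name below are illustrative, not taken from this diff). Holidays are passed as Ruby `Date` objects; the implementation converts them to day offsets from the Unix epoch, which works because `Date` subtraction yields the day count as a Rational:

    require "date"
    require "polars-df"

    # Date subtraction gives days since 1970-01-01, the offset the native call expects.
    Date.new(2020, 1, 1) - Date.new(1970, 1, 1)  # => (18262/1)

    df = Polars::DataFrame.new(
      {
        "start" => [Date.new(2020, 1, 2)],
        "end" => [Date.new(2020, 1, 10)]
      }
    )
    # Jan 2..9, 2020 has 6 weekdays; excluding the Jan 6 holiday leaves 5.
    df.with_columns(
      n: Polars.business_day_count("start", "end", holidays: [Date.new(2020, 1, 6)])
    )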
data/lib/polars/functions/datatype.rb
@@ -0,0 +1,21 @@
+ module Polars
+ module Functions
+ # Get a lazily evaluated :class:`DataType` of a column or expression.
+ #
+ # @note
+ # This functionality is considered **unstable**. It may be changed
+ # at any point without it being considered a breaking change.
+ #
+ # @return [DataTypeExpr]
+ def dtype_of(col_or_expr)
+ e = nil
+ if col_or_expr.is_a?(::String)
+ e = F.col(col_or_expr)
+ else
+ e = col_or_expr
+ end
+
+ DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.of_expr(e._rbexpr))
+ end
+ end
+ end
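A hedged sketch of `Polars.dtype_of`. Whether `Expr#cast` accepts a `DataTypeExpr` in this release is an assumption carried over from upstream Polars, not something this hunk shows:

    require "polars-df"

    df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["4", "5", "6"]})

    # Assumption: cast accepts a DataTypeExpr, mirroring upstream Polars.
    df.select(Polars.col("b").cast(Polars.dtype_of("a")))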
data/lib/polars/functions/lazy.rb CHANGED
@@ -1,5 +1,18 @@
  module Polars
  module Functions
+ # Select a field in the current `struct.with_fields` scope.
+ #
+ # @param name [Object]
+ # Name of the field(s) to select.
+ #
+ # @return [Expr]
+ def field(name)
+ if name.is_a?(::String)
+ name = [name]
+ end
+ Utils.wrap_expr(Plr.field(name))
+ end
+
  # Alias for an element being evaluated in an `eval` expression.
  #
  # @return [Expr]
@@ -810,7 +823,7 @@ module Polars
  # # ┌─────┐
  # # │ sum │
  # # │ --- │
- # # │ i64 │
+ # # │ i32 │
  # # ╞═════╡
  # # │ 10 │
  # # │ 13 │
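The first functions/lazy.rb hunk above adds `Polars.field`, which only has meaning inside a struct field scope. A hedged sketch, assuming the struct namespace gained a `with_fields` method in this release (that method is not shown in this section):

    require "polars-df"

    df = Polars::DataFrame.new({"a" => [1, 2]}).select(Polars.struct(["a"]).alias("s"))

    # Assumption: Expr#struct responds to with_fields, mirroring upstream Polars.
    df.select(
      Polars.col("s").struct.with_fields((Polars.field("a") * 2).alias("doubled"))
    )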
data/lib/polars/io/csv.rb CHANGED
@@ -499,7 +499,7 @@ module Polars
  # for instance `#`.
  # @param quote_char [String]
  # Single byte character used for csv quoting.
- # Set to None to turn off special handling and escaping of quotes.
+ # Set to nil to turn off special handling and escaping of quotes.
  # @param skip_rows [Integer]
  # Start reading after `skip_rows` lines. The header will be parsed at this
  # offset.
data/lib/polars/io/iceberg.rb
@@ -0,0 +1,27 @@
+ module Polars
+ module IO
+ # Lazily read from an Apache Iceberg table.
+ #
+ # @param source [Object]
+ # An Iceberg Ruby table, or a direct path to the metadata.
+ # @param snapshot_id [Integer]
+ # The snapshot ID to scan from.
+ # @param storage_options [Hash]
+ # Extra options for the storage backends.
+ #
+ # @return [LazyFrame]
+ def scan_iceberg(
+ source,
+ snapshot_id: nil,
+ storage_options: nil
+ )
+ require "iceberg"
+
+ unless source.is_a?(Iceberg::Table)
+ raise Todo
+ end
+
+ source.to_polars(snapshot_id:, storage_options:)
+ end
+ end
+ end
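A hedged sketch of `scan_iceberg`. It needs the `iceberg` gem and an already-loaded `Iceberg::Table`; `load_table` below is a hypothetical placeholder for whatever that gem's catalog API provides, and the direct metadata-path form still raises `Todo` in this diff:

    require "iceberg"
    require "polars-df"

    table = load_table("warehouse.events")  # hypothetical helper returning an Iceberg::Table
    lf = Polars.scan_iceberg(table, snapshot_id: nil)
    lf.limit(5).collect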
data/lib/polars/io/json.rb CHANGED
@@ -7,11 +7,11 @@ module Polars
  # @param schema [Object]
  # The DataFrame schema may be declared in several ways:
  #
- # * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
- # * As a list of column names; in this case types are automatically inferred.
- # * As a list of (name,type) pairs; this is equivalent to the dictionary form.
+ # * As a hash of \{name:type} pairs; if type is nil, it will be auto-inferred.
+ # * As an array of column names; in this case types are automatically inferred.
+ # * As an array of [name,type] pairs; this is equivalent to the hash form.
  #
- # If you supply a list of column names that does not match the names in the
+ # If you supply an array of column names that does not match the names in the
  # underlying data, the names given here will overwrite them. The number
  # of names given in the schema should match the underlying data dimensions.
  # @param schema_overrides [Hash]
data/lib/polars/io/ndjson.rb CHANGED
@@ -7,11 +7,11 @@ module Polars
  # @param schema [Object]
  # The DataFrame schema may be declared in several ways:
  #
- # * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
- # * As a list of column names; in this case types are automatically inferred.
- # * As a list of (name,type) pairs; this is equivalent to the dictionary form.
+ # * As a hash of \{name:type} pairs; if type is nil, it will be auto-inferred.
+ # * As an array of column names; in this case types are automatically inferred.
+ # * As an array of [name,type] pairs; this is equivalent to the hash form.
  #
- # If you supply a list of column names that does not match the names in the
+ # If you supply an array of column names that does not match the names in the
  # underlying data, the names given here will overwrite them. The number
  # of names given in the schema should match the underlying data dimensions.
  # @param schema_overrides [Hash]
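The two hunks above are the same docstring fix, apparently applied to io/json.rb and io/ndjson.rb (both +4 -4 in the file list). A hedged sketch of the three schema forms they describe, shown here with read_ndjson (the path is a placeholder):

    require "polars-df"

    # Hash of name => type pairs (nil types are inferred)
    Polars.read_ndjson("data.ndjson", schema: {"a" => Polars::Int64, "b" => Polars::Utf8})

    # Array of column names (types inferred)
    Polars.read_ndjson("data.ndjson", schema: ["a", "b"])

    # Array of [name, type] pairs (equivalent to the hash form)
    Polars.read_ndjson("data.ndjson", schema: [["a", Polars::Int64], ["b", Polars::Utf8]])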
data/lib/polars/io/parquet.rb CHANGED
@@ -43,7 +43,7 @@ module Polars
  # Extra options that make sense for a particular storage connection.
  # @param credential_provider [Object]
  # Provide a function that can be called to provide cloud storage
- # credentials. The function is expected to return a dictionary of
+ # credentials. The function is expected to return a hash of
  # credential keys along with an optional credential expiry time.
  # @param retries [Integer]
  # Number of retries if accessing a cloud instance fails.
@@ -117,13 +117,31 @@
  # @param source [Object]
  # Path to a file or a file-like object.
  #
- # @return [Hash]
+ # @return [Schema]
  def read_parquet_schema(source)
  if Utils.pathlike?(source)
  source = Utils.normalize_filepath(source)
  end

- Plr.parquet_schema(source)
+ scan_parquet(source).collect_schema
+ end
+
+ # Get file-level custom metadata of a Parquet file without reading data.
+ #
+ # @note
+ # This functionality is considered **experimental**. It may be removed or
+ # changed at any point without it being considered a breaking change.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
+ def read_parquet_metadata(source)
+ if Utils.pathlike?(source)
+ source = Utils.normalize_filepath(source, check_not_directory: false)
+ end
+
+ Plr.read_parquet_metadata(source)
  end

  # Lazily read from a parquet file or multiple files via glob patterns.
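A short sketch of the two calls above ("data.parquet" is a placeholder path). Per this hunk, read_parquet_schema now returns a Polars::Schema built via scan_parquet(source).collect_schema instead of a plain Hash, while read_parquet_metadata returns the file-level key/value metadata as a Hash:

    require "polars-df"

    Polars.read_parquet_schema("data.parquet")    # => Polars::Schema
    Polars.read_parquet_metadata("data.parquet")  # => Hash of custom key/value metadata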
@@ -171,7 +189,7 @@
  # Extra options that make sense for a particular storage connection.
  # @param credential_provider [Object]
  # Provide a function that can be called to provide cloud storage
- # credentials. The function is expected to return a dictionary of
+ # credentials. The function is expected to return a hash of
  # credential keys along with an optional credential expiry time.
  # @param retries [Integer]
  # Number of retries if accessing a cloud instance fails.
@@ -188,6 +206,9 @@
  # defined schema are encountered in the data:
  # * `ignore`: Silently ignores.
  # * `raise`: Raises an error.
+ # @param cast_options [Object]
+ # Configuration for column type-casting during scans. Useful for datasets
+ # containing files that have differing schemas.
  #
  # @return [LazyFrame]
  def scan_parquet(
@@ -210,7 +231,10 @@
  retries: 2,
  include_file_paths: nil,
  allow_missing_columns: false,
- extra_columns: "raise"
+ extra_columns: "raise",
+ cast_options: nil,
+ _column_mapping: nil,
+ _deletion_files: nil
  )
  missing_columns = allow_missing_columns ? "insert" : "raise"

@@ -247,7 +271,7 @@
  ScanOptions.new(
  row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
  pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
- # cast_options: cast_options,
+ cast_options: cast_options,
  extra_columns: extra_columns,
  missing_columns: missing_columns,
  include_file_paths: include_file_paths,
@@ -260,7 +284,8 @@
  storage_options: storage_options,
  # credential_provider: credential_provider_builder,
  retries: retries,
- # deletion_files: _deletion_files
+ deletion_files: _deletion_files,
+ column_mapping: _column_mapping
  ),
  parallel,
  low_memory,
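A hedged sketch of the new cast_options plumbing wired through above. The Polars::ScanCastOptions constructor keywords are an assumption borrowed from upstream Polars (the class itself is not shown in this section), and the glob is a placeholder:

    require "polars-df"

    # Assumption: ScanCastOptions accepts these keywords, mirroring upstream Polars.
    opts = Polars::ScanCastOptions.new(integer_cast: "upcast", float_cast: "upcast")

    lf = Polars.scan_parquet("data/*.parquet", cast_options: opts)
    lf.collect_schema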
data/lib/polars/io/scan_options.rb CHANGED
@@ -3,7 +3,8 @@ module Polars
  class ScanOptions
  attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
  :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
- :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping, :deletion_files
+ :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
+ :default_values, :deletion_files

  def initialize(
  row_index: nil,
@@ -22,6 +23,7 @@ module Polars
  credential_provider: nil,
  retries: 2,
  column_mapping: nil,
+ default_values: nil,
  deletion_files: nil
  )
  @row_index = row_index
@@ -40,6 +42,7 @@ module Polars
  @credential_provider = credential_provider
  @retries = retries
  @column_mapping = column_mapping
+ @default_values = default_values
  @deletion_files = deletion_files
  end
  end