polars-df 0.3.1-arm64-darwin → 0.4.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -50,6 +50,9 @@ From Parquet
50
50
 
51
51
  ```ruby
52
52
  Polars.read_parquet("file.parquet")
53
+
54
+ # or lazily with
55
+ Polars.scan_parquet("file.parquet")
53
56
  ```
54
57
 
55
58
  From Active Record
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
60
63
  Polars.read_sql("SELECT * FROM users")
61
64
  ```
62
65
 
66
+ From JSON
67
+
68
+ ```ruby
69
+ Polars.read_json("file.json")
70
+ # or
71
+ Polars.read_ndjson("file.ndjson")
72
+
73
+ # or lazily with
74
+ Polars.scan_ndjson("file.ndjson")
75
+ ```
76
+
77
+ From Feather / Arrow IPC
78
+
79
+ ```ruby
80
+ Polars.read_ipc("file.arrow")
81
+
82
+ # or lazily with
83
+ Polars.scan_ipc("file.arrow")
84
+ ```
85
+
86
+ From Avro
87
+
88
+ ```ruby
89
+ Polars.read_avro("file.avro")
90
+ ```
91
+
63
92
  From a hash
64
93
 
65
94
  ```ruby
Binary file
Binary file
Binary file
@@ -30,7 +30,7 @@ module Polars
30
30
  new_columns: nil
31
31
  )
32
32
  if Utils.pathlike?(file)
33
- path = Utils.format_path(file)
33
+ path = Utils.normalise_filepath(file)
34
34
  end
35
35
 
36
36
  dtype_list = nil
@@ -0,0 +1,77 @@
1
+ module Polars
2
+ # Namespace for binary related expressions.
3
+ class BinaryExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
+ def initialize(expr)
9
+ self._rbexpr = expr._rbexpr
10
+ end
11
+
12
+ # Check if binaries in Series contain a binary substring.
13
+ #
14
+ # @param lit [String]
15
+ # The binary substring to look for
16
+ #
17
+ # @return [Expr]
18
+ def contains(lit)
19
+ Utils.wrap_expr(_rbexpr.binary_contains(lit))
20
+ end
21
+
22
+ # Check if string values end with a binary substring.
23
+ #
24
+ # @param sub [String]
25
+ # Suffix substring.
26
+ #
27
+ # @return [Expr]
28
+ def ends_with(sub)
29
+ Utils.wrap_expr(_rbexpr.binary_ends_with(sub))
30
+ end
31
+
32
+ # Check if values start with a binary substring.
33
+ #
34
+ # @param sub [String]
35
+ # Prefix substring.
36
+ #
37
+ # @return [Expr]
38
+ def starts_with(sub)
39
+ Utils.wrap_expr(_rbexpr.binary_starts_with(sub))
40
+ end
41
+
42
+ # Decode a value using the provided encoding.
43
+ #
44
+ # @param encoding ["hex", "base64"]
45
+ # The encoding to use.
46
+ # @param strict [Boolean]
47
+ # Raise an error if the underlying value cannot be decoded,
48
+ # otherwise mask out with a null value.
49
+ #
50
+ # @return [Expr]
51
+ def decode(encoding, strict: true)
52
+ if encoding == "hex"
53
+ Utils.wrap_expr(_rbexpr.binary_hex_decode(strict))
54
+ elsif encoding == "base64"
55
+ Utils.wrap_expr(_rbexpr.binary_base64_decode(strict))
56
+ else
57
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
58
+ end
59
+ end
60
+
61
+ # Encode a value using the provided encoding.
62
+ #
63
+ # @param encoding ["hex", "base64"]
64
+ # The encoding to use.
65
+ #
66
+ # @return [Expr]
67
+ def encode(encoding)
68
+ if encoding == "hex"
69
+ Utils.wrap_expr(_rbexpr.binary_hex_encode)
70
+ elsif encoding == "base64"
71
+ Utils.wrap_expr(_rbexpr.binary_base64_encode)
72
+ else
73
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
74
+ end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,66 @@
1
+ module Polars
2
+ # Series.bin namespace: binary related methods exposed on a Series.
3
+ class BinaryNameSpace
4
+ include ExprDispatch # NOTE(review): the bare `super` calls below presumably resolve through this mixin - confirm
5
+
6
+ self._accessor = "bin"
7
+
8
+ # @private
9
+ def initialize(series)
10
+ self._s = series._s
11
+ end
12
+
13
+ # Check if binaries in Series contain a binary substring.
14
+ #
15
+ # @param lit [String]
16
+ # The binary substring to look for.
17
+ #
18
+ # @return [Series]
19
+ def contains(lit)
20
+ super
21
+ end
22
+
23
+ # Check if values end with a binary substring.
24
+ #
25
+ # @param sub [String]
26
+ # Suffix substring.
27
+ #
28
+ # @return [Series]
29
+ def ends_with(sub)
30
+ super
31
+ end
32
+
33
+ # Check if values start with a binary substring.
34
+ #
35
+ # @param sub [String]
36
+ # Prefix substring.
37
+ #
38
+ # @return [Series]
39
+ def starts_with(sub)
40
+ super
41
+ end
42
+
43
+ # Decode a value using the provided encoding.
44
+ #
45
+ # @param encoding ["hex", "base64"]
46
+ # The encoding to use.
47
+ # @param strict [Boolean]
48
+ # Raise an error if the underlying value cannot be decoded,
49
+ # otherwise mask out with a null value.
50
+ #
51
+ # @return [Series]
52
+ def decode(encoding, strict: true)
53
+ super
54
+ end
55
+
56
+ # Encode a value using the provided encoding.
57
+ #
58
+ # @param encoding ["hex", "base64"]
59
+ # The encoding to use.
60
+ #
61
+ # @return [Series]
62
+ def encode(encoding)
63
+ super
64
+ end
65
+ end
66
+ end
@@ -97,7 +97,7 @@ module Polars
97
97
  eol_char: "\n"
98
98
  )
99
99
  if Utils.pathlike?(file)
100
- path = Utils.format_path(file)
100
+ path = Utils.normalise_filepath(file)
101
101
  else
102
102
  path = nil
103
103
  # if defined?(StringIO) && file.is_a?(StringIO)
@@ -196,32 +196,56 @@ module Polars
196
196
 
197
197
  # @private
198
198
  def self._read_parquet(
199
- file,
199
+ source,
200
200
  columns: nil,
201
201
  n_rows: nil,
202
202
  parallel: "auto",
203
203
  row_count_name: nil,
204
204
  row_count_offset: 0,
205
- low_memory: false
205
+ low_memory: false,
206
+ use_statistics: true,
207
+ rechunk: true
206
208
  )
207
- if Utils.pathlike?(file)
208
- file = Utils.format_path(file)
209
+ if Utils.pathlike?(source)
210
+ source = Utils.normalise_filepath(source)
211
+ end
212
+ if columns.is_a?(String)
213
+ columns = [columns]
209
214
  end
210
215
 
211
- if file.is_a?(String) && file.include?("*")
212
- raise Todo
216
+ if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
217
+ scan =
218
+ Polars.scan_parquet(
219
+ source,
220
+ n_rows: n_rows,
221
+ rechunk: true,
222
+ parallel: parallel,
223
+ row_count_name: row_count_name,
224
+ row_count_offset: row_count_offset,
225
+ low_memory: low_memory
226
+ )
227
+
228
+ if columns.nil?
229
+ return self._from_rbdf(scan.collect._df)
230
+ elsif Utils.is_str_sequence(columns, allow_str: false)
231
+ return self._from_rbdf(scan.select(columns).collect._df)
232
+ else
233
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
234
+ end
213
235
  end
214
236
 
215
237
  projection, columns = Utils.handle_projection_columns(columns)
216
238
  _from_rbdf(
217
239
  RbDataFrame.read_parquet(
218
- file,
240
+ source,
219
241
  columns,
220
242
  projection,
221
243
  n_rows,
222
244
  parallel,
223
245
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
224
- low_memory
246
+ low_memory,
247
+ use_statistics,
248
+ rechunk
225
249
  )
226
250
  )
227
251
  end
@@ -229,7 +253,7 @@ module Polars
229
253
  # @private
230
254
  def self._read_avro(file, columns: nil, n_rows: nil)
231
255
  if Utils.pathlike?(file)
232
- file = Utils.format_path(file)
256
+ file = Utils.normalise_filepath(file)
233
257
  end
234
258
  projection, columns = Utils.handle_projection_columns(columns)
235
259
  _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
@@ -246,7 +270,7 @@ module Polars
246
270
  memory_map: true
247
271
  )
248
272
  if Utils.pathlike?(file)
249
- file = Utils.format_path(file)
273
+ file = Utils.normalise_filepath(file)
250
274
  end
251
275
  if columns.is_a?(String)
252
276
  columns = [columns]
@@ -272,7 +296,7 @@ module Polars
272
296
  # @private
273
297
  def self._read_json(file)
274
298
  if Utils.pathlike?(file)
275
- file = Utils.format_path(file)
299
+ file = Utils.normalise_filepath(file)
276
300
  end
277
301
 
278
302
  _from_rbdf(RbDataFrame.read_json(file))
@@ -281,7 +305,7 @@ module Polars
281
305
  # @private
282
306
  def self._read_ndjson(file)
283
307
  if Utils.pathlike?(file)
284
- file = Utils.format_path(file)
308
+ file = Utils.normalise_filepath(file)
285
309
  end
286
310
 
287
311
  _from_rbdf(RbDataFrame.read_ndjson(file))
@@ -774,7 +798,7 @@ module Polars
774
798
  row_oriented: false
775
799
  )
776
800
  if Utils.pathlike?(file)
777
- file = Utils.format_path(file)
801
+ file = Utils.normalise_filepath(file)
778
802
  end
779
803
 
780
804
  _df.write_json(file, pretty, row_oriented)
@@ -789,7 +813,7 @@ module Polars
789
813
  # @return [nil]
790
814
  def write_ndjson(file)
791
815
  if Utils.pathlike?(file)
792
- file = Utils.format_path(file)
816
+ file = Utils.normalise_filepath(file)
793
817
  end
794
818
 
795
819
  _df.write_ndjson(file)
@@ -879,7 +903,7 @@ module Polars
879
903
  end
880
904
 
881
905
  if Utils.pathlike?(file)
882
- file = Utils.format_path(file)
906
+ file = Utils.normalise_filepath(file)
883
907
  end
884
908
 
885
909
  _df.write_csv(
@@ -917,7 +941,7 @@ module Polars
917
941
  compression = "uncompressed"
918
942
  end
919
943
  if Utils.pathlike?(file)
920
- file = Utils.format_path(file)
944
+ file = Utils.normalise_filepath(file)
921
945
  end
922
946
 
923
947
  _df.write_avro(file, compression)
@@ -936,7 +960,7 @@ module Polars
936
960
  compression = "uncompressed"
937
961
  end
938
962
  if Utils.pathlike?(file)
939
- file = Utils.format_path(file)
963
+ file = Utils.normalise_filepath(file)
940
964
  end
941
965
 
942
966
  _df.write_ipc(file, compression)
@@ -978,7 +1002,7 @@ module Polars
978
1002
  compression = "uncompressed"
979
1003
  end
980
1004
  if Utils.pathlike?(file)
981
- file = Utils.format_path(file)
1005
+ file = Utils.normalise_filepath(file)
982
1006
  end
983
1007
 
984
1008
  _df.write_parquet(
@@ -3042,24 +3066,28 @@ module Polars
3042
3066
  if aggregate_fn.is_a?(String)
3043
3067
  case aggregate_fn
3044
3068
  when "first"
3045
- aggregate_fn = Polars.element.first
3069
+ aggregate_expr = Polars.element.first._rbexpr
3046
3070
  when "sum"
3047
- aggregate_fn = Polars.element.sum
3071
+ aggregate_expr = Polars.element.sum._rbexpr
3048
3072
  when "max"
3049
- aggregate_fn = Polars.element.max
3073
+ aggregate_expr = Polars.element.max._rbexpr
3050
3074
  when "min"
3051
- aggregate_fn = Polars.element.min
3075
+ aggregate_expr = Polars.element.min._rbexpr
3052
3076
  when "mean"
3053
- aggregate_fn = Polars.element.mean
3077
+ aggregate_expr = Polars.element.mean._rbexpr
3054
3078
  when "median"
3055
- aggregate_fn = Polars.element.median
3079
+ aggregate_expr = Polars.element.median._rbexpr
3056
3080
  when "last"
3057
- aggregate_fn = Polars.element.last
3081
+ aggregate_expr = Polars.element.last._rbexpr
3058
3082
  when "count"
3059
- aggregate_fn = Polars.count
3083
+ aggregate_expr = Polars.count._rbexpr
3060
3084
  else
3061
3085
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3062
3086
  end
3087
+ elsif aggregate_fn.nil?
3088
+ aggregate_expr = nil
3089
+ else
3090
+ aggregate_expr = aggregate_function._rbexpr
3063
3091
  end
3064
3092
 
3065
3093
  _from_rbdf(
@@ -3067,9 +3095,9 @@ module Polars
3067
3095
  values,
3068
3096
  index,
3069
3097
  columns,
3070
- aggregate_fn._rbexpr,
3071
3098
  maintain_order,
3072
3099
  sort_columns,
3100
+ aggregate_expr,
3073
3101
  separator
3074
3102
  )
3075
3103
  )
@@ -3174,7 +3202,7 @@ module Polars
3174
3202
  # # │ B ┆ 1 │
3175
3203
  # # │ C ┆ 2 │
3176
3204
  # # │ D ┆ 3 │
3177
- # # │ ... ...
3205
+ # # │
3178
3206
  # # │ F ┆ 5 │
3179
3207
  # # │ G ┆ 6 │
3180
3208
  # # │ H ┆ 7 │
@@ -4053,15 +4081,12 @@ module Polars
4053
4081
  # # │ 5 ┆ 3.0 ┆ true │
4054
4082
  # # └─────┴─────┴───────┘
4055
4083
  def unique(maintain_order: true, subset: nil, keep: "first")
4056
- if !subset.nil?
4057
- if subset.is_a?(String)
4058
- subset = [subset]
4059
- elsif !subset.is_a?(Array)
4060
- subset = subset.to_a
4061
- end
4062
- end
4063
-
4064
- _from_rbdf(_df.unique(maintain_order, subset, keep))
4084
+ self._from_rbdf(
4085
+ lazy
4086
+ .unique(maintain_order: maintain_order, subset: subset, keep: keep)
4087
+ .collect(no_optimization: true)
4088
+ ._df
4089
+ )
4065
4090
  end
4066
4091
 
4067
4092
  # Return the number of unique rows, or the number of unique row-subsets.
@@ -1130,7 +1130,7 @@ module Polars
1130
1130
  # ]
1131
1131
  # )
1132
1132
  # # =>
1133
- # # shape: (1001, 2)
1133
+ # # shape: (1_001, 2)
1134
1134
  # # ┌─────────────────────────┬───────────────────┐
1135
1135
  # # │ date ┆ milliseconds_diff │
1136
1136
  # # │ --- ┆ --- │
@@ -1140,7 +1140,7 @@ module Polars
1140
1140
  # # │ 2020-01-01 00:00:00.001 ┆ 1 │
1141
1141
  # # │ 2020-01-01 00:00:00.002 ┆ 1 │
1142
1142
  # # │ 2020-01-01 00:00:00.003 ┆ 1 │
1143
- # # │ ... ...
1143
+ # # │
1144
1144
  # # │ 2020-01-01 00:00:00.997 ┆ 1 │
1145
1145
  # # │ 2020-01-01 00:00:00.998 ┆ 1 │
1146
1146
  # # │ 2020-01-01 00:00:00.999 ┆ 1 │
@@ -1169,7 +1169,7 @@ module Polars
1169
1169
  # ]
1170
1170
  # )
1171
1171
  # # =>
1172
- # # shape: (1001, 2)
1172
+ # # shape: (1_001, 2)
1173
1173
  # # ┌─────────────────────────┬───────────────────┐
1174
1174
  # # │ date ┆ microseconds_diff │
1175
1175
  # # │ --- ┆ --- │
@@ -1179,7 +1179,7 @@ module Polars
1179
1179
  # # │ 2020-01-01 00:00:00.001 ┆ 1000 │
1180
1180
  # # │ 2020-01-01 00:00:00.002 ┆ 1000 │
1181
1181
  # # │ 2020-01-01 00:00:00.003 ┆ 1000 │
1182
- # # │ ... ...
1182
+ # # │
1183
1183
  # # │ 2020-01-01 00:00:00.997 ┆ 1000 │
1184
1184
  # # │ 2020-01-01 00:00:00.998 ┆ 1000 │
1185
1185
  # # │ 2020-01-01 00:00:00.999 ┆ 1000 │
@@ -1208,7 +1208,7 @@ module Polars
1208
1208
  # ]
1209
1209
  # )
1210
1210
  # # =>
1211
- # # shape: (1001, 2)
1211
+ # # shape: (1_001, 2)
1212
1212
  # # ┌─────────────────────────┬──────────────────┐
1213
1213
  # # │ date ┆ nanoseconds_diff │
1214
1214
  # # │ --- ┆ --- │
@@ -1218,7 +1218,7 @@ module Polars
1218
1218
  # # │ 2020-01-01 00:00:00.001 ┆ 1000000 │
1219
1219
  # # │ 2020-01-01 00:00:00.002 ┆ 1000000 │
1220
1220
  # # │ 2020-01-01 00:00:00.003 ┆ 1000000 │
1221
- # # │ ... ...
1221
+ # # │
1222
1222
  # # │ 2020-01-01 00:00:00.997 ┆ 1000000 │
1223
1223
  # # │ 2020-01-01 00:00:00.998 ┆ 1000000 │
1224
1224
  # # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
data/lib/polars/expr.rb CHANGED
@@ -2194,7 +2194,7 @@ module Polars
2194
2194
  # # │ 4 │
2195
2195
  # # │ 6 │
2196
2196
  # # │ 6 │
2197
- # # │ ...
2197
+ # # │
2198
2198
  # # │ 6 │
2199
2199
  # # │ 6 │
2200
2200
  # # │ 6 │
@@ -2571,7 +2571,7 @@ module Polars
2571
2571
  # # │ e │
2572
2572
  # # │ l │
2573
2573
  # # │ l │
2574
- # # │ ...
2574
+ # # │
2575
2575
  # # │ o │
2576
2576
  # # │ r │
2577
2577
  # # │ l │
@@ -4962,6 +4962,13 @@ module Polars
4962
4962
  ListExpr.new(self)
4963
4963
  end
4964
4964
 
4965
+ # Create an object namespace of all binary related methods.
4966
+ #
4967
+ # @return [BinaryExpr]
4968
+ def bin
4969
+ BinaryExpr.new(self)
4970
+ end
4971
+
4965
4972
  # Create an object namespace of all categorical related methods.
4966
4973
  #
4967
4974
  # @return [CatExpr]