polars-df 0.5.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +595 -709
  4. data/Cargo.toml +1 -0
  5. data/README.md +11 -9
  6. data/ext/polars/Cargo.toml +18 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +272 -136
  9. data/ext/polars/src/dataframe.rs +135 -94
  10. data/ext/polars/src/error.rs +8 -5
  11. data/ext/polars/src/expr/array.rs +15 -0
  12. data/ext/polars/src/expr/binary.rs +18 -6
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +78 -264
  15. data/ext/polars/src/expr/list.rs +41 -28
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +196 -0
  19. data/ext/polars/src/expr/string.rs +94 -66
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +119 -54
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +46 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +61 -44
  33. data/ext/polars/src/lib.rs +173 -84
  34. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  35. data/ext/polars/src/{apply → map}/mod.rs +10 -6
  36. data/ext/polars/src/{apply → map}/series.rs +12 -16
  37. data/ext/polars/src/object.rs +2 -2
  38. data/ext/polars/src/rb_modules.rs +25 -6
  39. data/ext/polars/src/series/construction.rs +32 -6
  40. data/ext/polars/src/series/export.rs +2 -2
  41. data/ext/polars/src/series/set_at_idx.rs +33 -17
  42. data/ext/polars/src/series.rs +62 -42
  43. data/ext/polars/src/sql.rs +46 -0
  44. data/lib/polars/array_expr.rb +84 -0
  45. data/lib/polars/array_name_space.rb +77 -0
  46. data/lib/polars/batched_csv_reader.rb +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +206 -131
  49. data/lib/polars/data_types.rb +163 -29
  50. data/lib/polars/date_time_expr.rb +13 -18
  51. data/lib/polars/date_time_name_space.rb +22 -28
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +241 -151
  54. data/lib/polars/functions.rb +29 -38
  55. data/lib/polars/group_by.rb +38 -76
  56. data/lib/polars/io.rb +37 -2
  57. data/lib/polars/lazy_frame.rb +174 -95
  58. data/lib/polars/lazy_functions.rb +87 -63
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +40 -36
  61. data/lib/polars/list_name_space.rb +15 -15
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +6 -4
  64. data/lib/polars/series.rb +95 -28
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +249 -69
  67. data/lib/polars/string_name_space.rb +155 -25
  68. data/lib/polars/utils.rb +119 -57
  69. data/lib/polars/version.rb +1 -1
  70. data/lib/polars.rb +6 -0
  71. metadata +21 -7
  72. /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -0,0 +1,194 @@
1
module Polars
  # Run SQL queries against DataFrame/LazyFrame data.
  class SQLContext
    # @private
    attr_accessor :_ctxt, :_eager_execution

    # Initialize a new `SQLContext`.
    #
    # @param frames [Hash, nil]
    #   A `{name: frame, ...}` mapping of frames to register immediately.
    # @param eager_execution [Boolean]
    #   Default used by `execute` when its `eager` option is not given.
    # @param named_frames [Hash]
    #   Named eager/lazy frames, provided as kwargs.
    def initialize(frames = nil, eager_execution: false, **named_frames)
      self._ctxt = RbSQLContext.new
      self._eager_execution = eager_execution

      frames = (frames || {}).to_h

      if frames.any? || named_frames.any?
        register_many(frames, **named_frames)
      end
    end

    # Parse the given SQL query and execute it against the registered frame data.
    #
    # @param query [String]
    #   A valid string SQL query.
    # @param eager [Boolean, nil]
    #   Apply the query eagerly, returning `DataFrame` instead of `LazyFrame`.
    #   If unset (nil), the value of the init-time parameter "eager_execution" will be
    #   used; an explicit `true`/`false` always takes precedence over it.
    #   (Note that the query itself is always executed in lazy-mode; this
    #   parameter only impacts the type of the returned frame).
    #
    # @return [Object]
    #   The collected result when eager, otherwise the lazy result.
    #
    # @example Execute a SQL query against the registered frame data:
    #   df = Polars::DataFrame.new(
    #     [
    #       ["The Godfather", 1972, 6_000_000, 134_821_952, 9.2],
    #       ["The Dark Knight", 2008, 185_000_000, 533_316_061, 9.0],
    #       ["Schindler's List", 1993, 22_000_000, 96_067_179, 8.9],
    #       ["Pulp Fiction", 1994, 8_000_000, 107_930_000, 8.9],
    #       ["The Shawshank Redemption", 1994, 25_000_000, 28_341_469, 9.3],
    #     ],
    #     schema: ["title", "release_year", "budget", "gross", "imdb_score"]
    #   )
    #   ctx = Polars::SQLContext.new(films: df)
    #   ctx.execute(
    #     "
    #     SELECT title, release_year, imdb_score
    #     FROM films
    #     WHERE release_year > 1990
    #     ORDER BY imdb_score DESC
    #     ",
    #     eager: true
    #   )
    #   # =>
    #   # shape: (4, 3)
    #   # ┌──────────────────────────┬──────────────┬────────────┐
    #   # │ title                    ┆ release_year ┆ imdb_score │
    #   # │ ---                      ┆ ---          ┆ ---        │
    #   # │ str                      ┆ i64          ┆ f64        │
    #   # ╞══════════════════════════╪══════════════╪════════════╡
    #   # │ The Shawshank Redemption ┆ 1994         ┆ 9.3        │
    #   # │ The Dark Knight          ┆ 2008         ┆ 9.0        │
    #   # │ Schindler's List         ┆ 1993         ┆ 8.9        │
    #   # │ Pulp Fiction             ┆ 1994         ┆ 8.9        │
    #   # └──────────────────────────┴──────────────┴────────────┘
    #
    # @example Execute a GROUP BY query:
    #   ctx.execute(
    #     "
    #     SELECT
    #       MAX(release_year / 10) * 10 AS decade,
    #       SUM(gross) AS total_gross,
    #       COUNT(title) AS n_films,
    #     FROM films
    #     GROUP BY (release_year / 10) -- decade
    #     ORDER BY total_gross DESC
    #     ",
    #     eager: true
    #   )
    #   # =>
    #   # shape: (3, 3)
    #   # ┌────────┬─────────────┬─────────┐
    #   # │ decade ┆ total_gross ┆ n_films │
    #   # │ ---    ┆ ---         ┆ ---     │
    #   # │ i64    ┆ i64         ┆ u32     │
    #   # ╞════════╪═════════════╪═════════╡
    #   # │ 2000   ┆ 533316061   ┆ 1       │
    #   # │ 1990   ┆ 232338648   ┆ 3       │
    #   # │ 1970   ┆ 134821952   ┆ 1       │
    #   # └────────┴─────────────┴─────────┘
    def execute(query, eager: nil)
      res = Utils.wrap_ldf(_ctxt.execute(query))
      # Fall back to the init-time default only when `eager` was not given, so
      # an explicit `eager: false` is honored even if eager_execution is true.
      eager = _eager_execution if eager.nil?
      eager ? res.collect : res
    end

    # Register a single frame as a table, using the given name.
    #
    # @param name [String]
    #   Name of the table (converted with `to_s`).
    # @param frame [Object]
    #   eager/lazy frame to associate with this table name.
    #
    # @return [SQLContext]
    #
    # @example
    #   df = Polars::DataFrame.new({"hello" => ["world"]})
    #   ctx = Polars::SQLContext.new
    #   ctx.register("frame_data", df).execute("SELECT * FROM frame_data").collect
    #   # =>
    #   # shape: (1, 1)
    #   # ┌───────┐
    #   # │ hello │
    #   # │ ---   │
    #   # │ str   │
    #   # ╞═══════╡
    #   # │ world │
    #   # └───────┘
    def register(name, frame)
      # Eager frames are registered through their lazy counterpart.
      if frame.is_a?(DataFrame)
        frame = frame.lazy
      end
      _ctxt.register(name.to_s, frame._ldf)
      self
    end

    # Register multiple eager/lazy frames as tables, using the associated names.
    #
    # @param frames [Hash]
    #   A `{name:frame, ...}` mapping.
    # @param named_frames [Object]
    #   Named eager/lazy frames, provided as kwargs. On a name clash these
    #   take precedence over entries in `frames`.
    #
    # @return [SQLContext]
    def register_many(frames, **named_frames)
      frames = (frames || {}).to_h
      frames = frames.merge(named_frames)
      frames.each do |name, frame|
        register(name, frame)
      end
      self
    end

    # Unregister one or more eager/lazy frames by name.
    #
    # @param names [String, Symbol, Array]
    #   Name (or names) of the tables to unregister. Names are converted with
    #   `to_s`, mirroring `register`.
    #
    # @return [SQLContext]
    #
    # @example Register with a SQLContext object:
    #   df0 = Polars::DataFrame.new({"ints" => [9, 8, 7, 6, 5]})
    #   lf1 = Polars::LazyFrame.new({"text" => ["a", "b", "c"]})
    #   lf2 = Polars::LazyFrame.new({"misc" => ["testing1234"]})
    #   ctx = Polars::SQLContext.new(test1: df0, test2: lf1, test3: lf2)
    #   ctx.tables
    #   # => ["test1", "test2", "test3"]
    #
    # @example Unregister one or more of the tables:
    #   ctx.unregister(["test1", "test3"]).tables
    #   # => ["test2"]
    def unregister(names)
      # Array() wraps a lone String/Symbol and leaves arrays untouched.
      Array(names).each do |nm|
        _ctxt.unregister(nm.to_s)
      end
      self
    end

    # Return a sorted list of the registered table names.
    #
    # @return [Array]
    #
    # @example Executing as SQL:
    #   frame_data = Polars::DataFrame.new({"hello" => ["world"]})
    #   ctx = Polars::SQLContext.new(hello_world: frame_data)
    #   ctx.execute("SHOW TABLES", eager: true)
    #   # =>
    #   # shape: (1, 1)
    #   # ┌─────────────┐
    #   # │ name        │
    #   # │ ---         │
    #   # │ str         │
    #   # ╞═════════════╡
    #   # │ hello_world │
    #   # └─────────────┘
    #
    # @example Calling the method:
    #   ctx.tables
    #   # => ["hello_world"]
    def tables
      _ctxt.get_tables.sort
    end
  end
end
@@ -9,11 +9,134 @@ module Polars
9
9
  self._rbexpr = expr._rbexpr
10
10
  end
11
11
 
12
+ # Convert a Utf8 column into a Date column.
13
+ #
14
+ # @param format [String]
15
+ # Format to use for conversion. Refer to the
16
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
17
+ # for the full specification. Example: `"%Y-%m-%d"`.
18
+ # If set to nil (default), the format is inferred from the data.
19
+ # @param strict [Boolean]
20
+ # Raise an error if any conversion fails.
21
+ # @param exact [Boolean]
22
+ # Require an exact format match. If false, allow the format to match anywhere
23
+ # in the target string.
24
+ # @param cache [Boolean]
25
+ # Use a cache of unique, converted dates to apply the conversion.
26
+ #
27
+ # @return [Expr]
28
+ #
29
+ # @example
30
+ # s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
31
+ # s.str.to_date
32
+ # # =>
33
+ # # shape: (3,)
34
+ # # Series: '' [date]
35
+ # # [
36
+ # # 2020-01-01
37
+ # # 2020-02-01
38
+ # # 2020-03-01
39
+ # # ]
40
def to_date(format = nil, strict: true, exact: true, cache: true)
  # Run the shared format-argument check before building the expression.
  _validate_format_argument(format)
  date_rbexpr = self._rbexpr.str_to_date(format, strict, exact, cache)
  Utils.wrap_expr(date_rbexpr)
end
44
+
45
+ # Convert a Utf8 column into a Datetime column.
46
+ #
47
+ # @param format [String]
48
+ # Format to use for conversion. Refer to the
49
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
50
+ # for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
51
+ # If set to nil (default), the format is inferred from the data.
52
+ # @param time_unit ["us", "ns", "ms"]
53
+ # Unit of time for the resulting Datetime column. If set to nil (default),
54
+ # the time unit is inferred from the format string if given, eg:
55
+ # `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
56
+ # found, the default is `"us"`.
57
+ # @param time_zone [String]
58
+ # Time zone for the resulting Datetime column.
59
+ # @param strict [Boolean]
60
+ # Raise an error if any conversion fails.
61
+ # @param exact [Boolean]
62
+ # Require an exact format match. If false, allow the format to match anywhere
63
+ # in the target string.
64
+ # @param cache [Boolean]
65
+ # Use a cache of unique, converted datetimes to apply the conversion.
66
+ #
67
+ # @return [Expr]
68
+ #
69
+ # @example
70
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
71
+ # s.str.to_datetime("%Y-%m-%d %H:%M%#z")
72
+ # # =>
73
+ # # shape: (2,)
74
+ # # Series: '' [datetime[μs, UTC]]
75
+ # # [
76
+ # # 2020-01-01 01:00:00 UTC
77
+ # # 2020-01-01 02:00:00 UTC
78
+ # # ]
79
def to_datetime(
  format = nil,
  time_unit: nil,
  time_zone: nil,
  strict: true,
  exact: true,
  cache: true,
  use_earliest: nil,
  ambiguous: "raise"
)
  _validate_format_argument(format)

  # `use_earliest` and `ambiguous` are reconciled by the shared helper; a
  # non-expression result is wrapped as a literal so the native call always
  # receives an expression.
  resolved = Utils.rename_use_earliest_to_ambiguous(use_earliest, ambiguous)
  ambiguous_expr = resolved.is_a?(Expr) ? resolved : Polars.lit(resolved)

  datetime_rbexpr = self._rbexpr.str_to_datetime(
    format,
    time_unit,
    time_zone,
    strict,
    exact,
    cache,
    ambiguous_expr._rbexpr
  )
  Utils.wrap_expr(datetime_rbexpr)
end
104
+
105
+ # Convert a Utf8 column into a Time column.
106
+ #
107
+ # @param format [String]
108
+ # Format to use for conversion. Refer to the
109
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
110
+ # for the full specification. Example: `"%H:%M:%S"`.
111
+ # If set to nil (default), the format is inferred from the data.
112
+ # @param strict [Boolean]
113
+ # Raise an error if any conversion fails.
114
+ # @param cache [Boolean]
115
+ # Use a cache of unique, converted times to apply the conversion.
116
+ #
117
+ # @return [Expr]
118
+ #
119
+ # @example
120
+ # s = Polars::Series.new(["01:00", "02:00", "03:00"])
121
+ # s.str.to_time("%H:%M")
122
+ # # =>
123
+ # # shape: (3,)
124
+ # # Series: '' [time]
125
+ # # [
126
+ # # 01:00:00
127
+ # # 02:00:00
128
+ # # 03:00:00
129
+ # # ]
130
def to_time(format = nil, strict: true, cache: true)
  # Run the shared format-argument check before building the expression.
  _validate_format_argument(format)
  time_rbexpr = _rbexpr.str_to_time(format, strict, cache)
  Utils.wrap_expr(time_rbexpr)
end
134
+
12
135
  # Parse a Utf8 expression to a Date/Datetime/Time type.
13
136
  #
14
137
  # @param dtype [Object]
15
138
  # The data type to convert into. Can be either Date, Datetime, or Time.
16
- # @param fmt [String]
139
+ # @param format [String]
17
140
  # Format to use, refer to the
18
141
  # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
19
142
  # for specification. Example: `"%y-%m-%d"`.
@@ -38,10 +161,10 @@ module Polars
38
161
  # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
39
162
  # # =>
40
163
  # # shape: (2,)
41
- # # Series: '' [datetime[μs, +00:00]]
164
+ # # Series: '' [datetime[μs, UTC]]
42
165
  # # [
43
- # # 2020-01-01 01:00:00 +00:00
44
- # # 2020-01-01 02:00:00 +00:00
166
+ # # 2020-01-01 01:00:00 UTC
167
+ # # 2020-01-01 02:00:00 UTC
45
168
  # # ]
46
169
  #
47
170
  # @example Dealing with different formats.
@@ -71,16 +194,18 @@ module Polars
71
194
  # # 2022-01-31
72
195
  # # 2001-07-08
73
196
  # # ]
74
- def strptime(dtype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
197
+ def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
198
+ _validate_format_argument(format)
199
+
75
200
  if dtype == Date
76
- Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact, cache))
201
+ to_date(format, strict: strict, exact: exact, cache: cache)
77
202
  elsif dtype == Datetime || dtype.is_a?(Datetime)
78
203
  dtype = Datetime.new if dtype == Datetime
79
204
  time_unit = dtype.time_unit
80
205
  time_zone = dtype.time_zone
81
- Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, time_unit, time_zone, strict, exact, cache, tz_aware, utc))
206
+ to_datetime(format, time_unit: time_unit, time_zone: time_zone, strict: strict, exact: exact, cache: cache)
82
207
  elsif dtype == Time
83
- Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact, cache))
208
+ to_time(format, strict: strict, cache: cache)
84
209
  else
85
210
  raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
86
211
  end
@@ -115,7 +240,7 @@ module Polars
115
240
  # # │ 東京 ┆ 6 ┆ 2 │
116
241
  # # └──────┴────────┴────────┘
117
242
def lengths
  # Delegates to the native byte-length expression.
  byte_lengths = _rbexpr.str_len_bytes
  Utils.wrap_expr(byte_lengths)
end
120
245
 
121
246
  # Get length of the strings as `:u32` (as number of chars).
@@ -147,13 +272,15 @@ module Polars
147
272
  # # │ 東京 ┆ 6 ┆ 2 │
148
273
  # # └──────┴────────┴────────┘
149
274
def n_chars
  # Delegates to the native character-length expression.
  char_lengths = _rbexpr.str_len_chars
  Utils.wrap_expr(char_lengths)
end
152
277
 
153
278
  # Vertically concat the values in the Series to a single string value.
154
279
  #
155
280
  # @param delimiter [String]
156
281
  # The delimiter to insert between consecutive string values.
282
+ # @param ignore_nulls [Boolean]
283
+ # Ignore null values (default).
157
284
  #
158
285
  # @return [Expr]
159
286
  #
@@ -162,15 +289,28 @@ module Polars
162
289
  # df.select(Polars.col("foo").str.concat("-"))
163
290
  # # =>
164
291
  # # shape: (1, 1)
165
- # # ┌──────────┐
166
- # # │ foo
167
- # # │ ---
168
- # # │ str
169
- # # ╞══════════╡
170
- # # │ 1-null-2 │
171
- # # └──────────┘
172
- def concat(delimiter = "-")
173
- Utils.wrap_expr(_rbexpr.str_concat(delimiter))
292
+ # # ┌─────┐
293
+ # # │ foo
294
+ # # │ ---
295
+ # # │ str
296
+ # # ╞═════╡
297
+ # # │ 1-2 │
298
+ # # └─────┘
299
+ #
300
+ # @example
301
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
302
+ # df.select(Polars.col("foo").str.concat("-", ignore_nulls: false))
303
+ # # =>
304
+ # # shape: (1, 1)
305
+ # # ┌──────┐
306
+ # # │ foo │
307
+ # # │ --- │
308
+ # # │ str │
309
+ # # ╞══════╡
310
+ # # │ null │
311
+ # # └──────┘
312
def concat(delimiter = "-", ignore_nulls: true)
  # Both options are forwarded positionally to the native expression.
  joined = _rbexpr.str_concat(delimiter, ignore_nulls)
  Utils.wrap_expr(joined)
end
175
315
 
176
316
  # Transform to uppercase variant.
@@ -217,7 +357,7 @@ module Polars
217
357
 
218
358
  # Remove leading and trailing whitespace.
219
359
  #
220
- # @param matches [String, nil]
360
+ # @param characters [String, nil]
221
361
  # The set of characters to strip. If nil (default), whitespace is removed.
222
362
  #
223
363
  # @return [Expr]
@@ -236,16 +376,15 @@ module Polars
236
376
  # # │ trail │
237
377
  # # │ both │
238
378
  # # └───────┘
239
- def strip(matches = nil)
240
- if !matches.nil? && matches.length > 1
241
- raise ArgumentError, "matches should contain a single character"
242
- end
243
- Utils.wrap_expr(_rbexpr.str_strip(matches))
379
def strip_chars(characters = nil)
  # Plain strings are treated as literals rather than column names.
  chars_expr = Utils.parse_as_expression(characters, str_as_lit: true)
  stripped = _rbexpr.str_strip_chars(chars_expr)
  Utils.wrap_expr(stripped)
end
383
+ alias_method :strip, :strip_chars
245
384
 
246
385
  # Remove leading whitespace.
247
386
  #
248
- # @param matches [String, nil]
387
+ # @param characters [String, nil]
249
388
  # The set of characters to strip. If nil (default), whitespace is removed.
250
389
  #
251
390
  # @return [Expr]
@@ -264,16 +403,15 @@ module Polars
264
403
  # # │ trail │
265
404
  # # │ both │
266
405
  # # └────────┘
267
- def lstrip(matches = nil)
268
- if !matches.nil? && matches.length > 1
269
- raise ArgumentError, "matches should contain a single character"
270
- end
271
- Utils.wrap_expr(_rbexpr.str_lstrip(matches))
406
def strip_chars_start(characters = nil)
  # Plain strings are treated as literals rather than column names.
  chars_expr = Utils.parse_as_expression(characters, str_as_lit: true)
  stripped = _rbexpr.str_strip_chars_start(chars_expr)
  Utils.wrap_expr(stripped)
end
410
+ alias_method :lstrip, :strip_chars_start
273
411
 
274
412
  # Remove trailing whitespace.
275
413
  #
276
- # @param matches [String, nil]
414
+ # @param characters [String, nil]
277
415
  # The set of characters to strip. If nil (default), whitespace is removed.
278
416
  #
279
417
  # @return [Expr]
@@ -292,12 +430,11 @@ module Polars
292
430
  # # │ trail │
293
431
  # # │ both │
294
432
  # # └───────┘
295
- def rstrip(matches = nil)
296
- if !matches.nil? && matches.length > 1
297
- raise ArgumentError, "matches should contain a single character"
298
- end
299
- Utils.wrap_expr(_rbexpr.str_rstrip(matches))
433
def strip_chars_end(characters = nil)
  # Plain strings are treated as literals rather than column names.
  chars_expr = Utils.parse_as_expression(characters, str_as_lit: true)
  stripped = _rbexpr.str_strip_chars_end(chars_expr)
  Utils.wrap_expr(stripped)
end
437
+ alias_method :rstrip, :strip_chars_end
301
438
 
302
439
  # Fills the string with zeroes.
303
440
  #
@@ -341,13 +478,13 @@ module Polars
341
478
  Utils.wrap_expr(_rbexpr.str_zfill(alignment))
342
479
  end
343
480
 
344
- # Return the string left justified in a string of length `width`.
481
+ # Return the string left justified in a string of length `length`.
345
482
  #
346
483
  # Padding is done using the specified `fillchar`.
347
- # The original string is returned if `width` is less than or equal to
484
+ # The original string is returned if `length` is less than or equal to
348
485
  # `s.length`.
349
486
  #
350
- # @param width [Integer]
487
+ # @param length [Integer]
351
488
  # Justify left to this length.
352
489
  # @param fillchar [String]
353
490
  # Fill with this ASCII character.
@@ -369,17 +506,18 @@ module Polars
369
506
  # # │ null │
370
507
  # # │ hippopotamus │
371
508
  # # └──────────────┘
372
- def ljust(width, fillchar = " ")
373
- Utils.wrap_expr(_rbexpr.str_ljust(width, fillchar))
509
def ljust(length, fillchar = " ")
  # Left-justifying is implemented as padding at the end of the string.
  padded = _rbexpr.str_pad_end(length, fillchar)
  Utils.wrap_expr(padded)
end
512
+ alias_method :pad_end, :ljust
375
513
 
376
- # Return the string right justified in a string of length `width`.
514
+ # Return the string right justified in a string of length `length`.
377
515
  #
378
516
  # Padding is done using the specified `fillchar`.
379
- # The original string is returned if `width` is less than or equal to
517
+ # The original string is returned if `length` is less than or equal to
380
518
  # `s.length`.
381
519
  #
382
- # @param width [Integer]
520
+ # @param length [Integer]
383
521
  # Justify right to this length.
384
522
  # @param fillchar [String]
385
523
  # Fill with this ASCII character.
@@ -401,9 +539,10 @@ module Polars
401
539
  # # │ null │
402
540
  # # │ hippopotamus │
403
541
  # # └──────────────┘
404
- def rjust(width, fillchar = " ")
405
- Utils.wrap_expr(_rbexpr.str_rjust(width, fillchar))
542
def rjust(length, fillchar = " ")
  # Right-justifying is implemented as padding at the start of the string.
  padded = _rbexpr.str_pad_start(length, fillchar)
  Utils.wrap_expr(padded)
end
545
+ alias_method :pad_start, :rjust
407
546
 
408
547
  # Check if string contains a substring that matches a regex.
409
548
  #
@@ -547,11 +686,11 @@ module Polars
547
686
  # # │ {null,null} │
548
687
  # # │ {2,false} │
549
688
  # # └─────────────┘
550
- def json_extract(dtype = nil)
689
def json_extract(dtype = nil, infer_schema_length: 100)
  # A given dtype is normalized first; nil is passed through unchanged.
  dtype = Utils.rb_type_to_dtype(dtype) unless dtype.nil?
  extracted = _rbexpr.str_json_extract(dtype, infer_schema_length)
  Utils.wrap_expr(extracted)
end
556
695
 
557
696
  # Extract the first match of json string with provided JSONPath expression.
@@ -744,9 +883,11 @@ module Polars
744
883
  # # │ 5 │
745
884
  # # │ 6 │
746
885
  # # └──────────────┘
747
- def count_match(pattern)
748
- Utils.wrap_expr(_rbexpr.count_match(pattern))
886
def count_matches(pattern, literal: false)
  # Plain strings are treated as literals rather than column names.
  pattern_expr = Utils.parse_as_expression(pattern, str_as_lit: true)
  counted = _rbexpr.str_count_matches(pattern_expr, literal)
  Utils.wrap_expr(counted)
end
890
+ alias_method :count_match, :count_matches
750
891
 
751
892
  # Split the string by a substring.
752
893
  #
@@ -772,6 +913,7 @@ module Polars
772
913
  # # │ ["foo", "bar", "baz"] │
773
914
  # # └───────────────────────┘
774
915
  def split(by, inclusive: false)
916
+ by = Utils.parse_as_expression(by, str_as_lit: true)
775
917
  if inclusive
776
918
  Utils.wrap_expr(_rbexpr.str_split_inclusive(by))
777
919
  else
@@ -814,6 +956,7 @@ module Polars
814
956
  # # │ {"d","4"} │
815
957
  # # └─────────────┘
816
958
  def split_exact(by, n, inclusive: false)
959
+ by = Utils.parse_as_expression(by, str_as_lit: true)
817
960
  if inclusive
818
961
  Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(by, n))
819
962
  else
@@ -850,6 +993,7 @@ module Polars
850
993
  # # │ {"foo","bar baz"} │
851
994
  # # └───────────────────┘
852
995
def splitn(by, n)
  # Plain strings are treated as literals rather than column names.
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  pieces = _rbexpr.str_splitn(separator, n)
  Utils.wrap_expr(pieces)
end
855
999
 
@@ -968,7 +1112,53 @@ module Polars
968
1112
  # # │ r │
969
1113
  # # └─────┘
970
1114
def explode
  # Uses the string-specific native explode.
  exploded = _rbexpr.str_explode
  Utils.wrap_expr(exploded)
end
1117
+
1118
+ # Convert a Utf8 column into an Int64 column, parsing with the given base (radix).
1119
+ #
1120
+ # @param base [Integer]
1121
+ # Positive integer which is the base of the string we are parsing.
1122
+ # Default: 10.
1123
+ # @param strict [Boolean]
1124
+ # If true (default), raise a ComputeError on any parse error or overflow.
1125
+ # If false, silently convert values that cannot be parsed to null.
1126
+ #
1127
+ # @return [Expr]
1128
+ #
1129
+ # @example
1130
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1131
+ # df.with_columns(Polars.col("bin").str.to_integer(base: 2, strict: false).alias("parsed"))
1132
+ # # =>
1133
+ # # shape: (4, 2)
1134
+ # # ┌─────────┬────────┐
1135
+ # # │ bin ┆ parsed │
1136
+ # # │ --- ┆ --- │
1137
+ # # │ str ┆ i64 │
1138
+ # # ╞═════════╪════════╡
1139
+ # # │ 110 ┆ 6 │
1140
+ # # │ 101 ┆ 5 │
1141
+ # # │ 010 ┆ 2 │
1142
+ # # │ invalid ┆ null │
1143
+ # # └─────────┴────────┘
1144
+ #
1145
+ # @example
1146
+ # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1147
+ # df.with_columns(Polars.col("hex").str.to_integer(base: 16, strict: true).alias("parsed"))
1148
+ # # =>
1149
+ # # shape: (4, 2)
1150
+ # # ┌──────┬────────┐
1151
+ # # │ hex ┆ parsed │
1152
+ # # │ --- ┆ --- │
1153
+ # # │ str ┆ i64 │
1154
+ # # ╞══════╪════════╡
1155
+ # # │ fa1e ┆ 64030 │
1156
+ # # │ ff00 ┆ 65280 │
1157
+ # # │ cafe ┆ 51966 │
1158
+ # # │ null ┆ null │
1159
+ # # └──────┴────────┘
1160
def to_integer(base: 10, strict: true)
  # Keyword options are forwarded positionally to the native expression.
  parsed = _rbexpr.str_to_integer(base, strict)
  Utils.wrap_expr(parsed)
end
973
1163
 
974
1164
  # Parse integers with base radix from strings.
@@ -999,24 +1189,14 @@ module Polars
999
1189
  # # │ 2 │
1000
1190
  # # │ null │
1001
1191
  # # └──────┘
1002
- #
1003
- # @example
1004
- # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1005
- # df.select(Polars.col("hex").str.parse_int(16, strict: true))
1006
- # # =>
1007
- # # shape: (4, 1)
1008
- # # ┌───────┐
1009
- # # │ hex │
1010
- # # │ --- │
1011
- # # │ i32 │
1012
- # # ╞═══════╡
1013
- # # │ 64030 │
1014
- # # │ 65280 │
1015
- # # │ 51966 │
1016
- # # │ null │
1017
- # # └───────┘
1018
1192
def parse_int(radix = 2, strict: true)
  # Forward the caller's radix. The base was previously hard-coded to 2, so
  # e.g. `parse_int(16)` silently parsed hexadecimal input as binary.
  # The result is cast to Int32, with the same `strict` setting applied to
  # both the parse and the cast.
  to_integer(base: radix, strict: strict).cast(Int32, strict: strict)
end
1195
+
1196
+ private
1197
+
1198
# Warn about a common mistake in chrono format strings.
#
# `.%f` does not parse the digits after a decimal point; the chrono
# specifier for fractional seconds is `%.f`. Only a warning is emitted,
# so parsing still proceeds with the format exactly as given.
#
# @param format [String, nil]
#   The format string passed by the caller; non-String values are ignored.
#
# @return [nil]
def _validate_format_argument(format)
  return unless format.is_a?(String) && format.include?(".%f")

  warn(
    "Detected the pattern `.%f` in the chrono format string. " \
    "This pattern should not be used to parse values after a decimal point. " \
    "Use `%.f` instead. " \
    "See the full specification: https://docs.rs/chrono/latest/chrono/format/strftime"
  )
end
1021
1201
  end
1022
1202
  end