polars-df 0.4.0-x86_64-darwin → 0.6.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,11 +9,129 @@ module Polars
9
9
  self._rbexpr = expr._rbexpr
10
10
  end
11
11
 
12
+ # Convert a Utf8 column into a Date column.
13
+ #
14
+ # @param format [String]
15
+ # Format to use for conversion. Refer to the
16
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
17
+ # for the full specification. Example: `"%Y-%m-%d"`.
18
+ # If set to nil (default), the format is inferred from the data.
19
+ # @param strict [Boolean]
20
+ # Raise an error if any conversion fails.
21
+ # @param exact [Boolean]
22
+ # Require an exact format match. If false, allow the format to match anywhere
23
+ # in the target string.
24
+ # @param cache [Boolean]
25
+ # Use a cache of unique, converted dates to apply the conversion.
26
+ #
27
+ # @return [Expr]
28
+ #
29
+ # @example
30
+ # s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
31
+ # s.str.to_date
32
+ # # =>
33
+ # # shape: (3,)
34
+ # # Series: '' [date]
35
+ # # [
36
+ # # 2020-01-01
37
+ # # 2020-02-01
38
+ # # 2020-03-01
39
+ # # ]
40
+ def to_date(format = nil, strict: true, exact: true, cache: true)
41
+ _validate_format_argument(format)
42
+ Utils.wrap_expr(self._rbexpr.str_to_date(format, strict, exact, cache))
43
+ end
44
+
45
+ # Convert a Utf8 column into a Datetime column.
46
+ #
47
+ # @param format [String]
48
+ # Format to use for conversion. Refer to the
49
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
50
+ # for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
51
+ # If set to nil (default), the format is inferred from the data.
52
+ # @param time_unit ["us", "ns", "ms"]
53
+ # Unit of time for the resulting Datetime column. If set to nil (default),
54
+ # the time unit is inferred from the format string if given, e.g.:
55
+ # `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
56
+ # found, the default is `"us"`.
57
+ # @param time_zone [String]
58
+ # Time zone for the resulting Datetime column.
59
+ # @param strict [Boolean]
60
+ # Raise an error if any conversion fails.
61
+ # @param exact [Boolean]
62
+ # Require an exact format match. If false, allow the format to match anywhere
63
+ # in the target string.
64
+ # @param cache [Boolean]
65
+ # Use a cache of unique, converted datetimes to apply the conversion.
66
+ #
67
+ # @return [Expr]
68
+ #
69
+ # @example
70
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
71
+ # s.str.to_datetime("%Y-%m-%d %H:%M%#z")
72
+ # # =>
73
+ # # shape: (2,)
74
+ # # Series: '' [datetime[μs, UTC]]
75
+ # # [
76
+ # # 2020-01-01 01:00:00 UTC
77
+ # # 2020-01-01 02:00:00 UTC
78
+ # # ]
79
+ def to_datetime(
80
+ format = nil,
81
+ time_unit: nil,
82
+ time_zone: nil,
83
+ strict: true,
84
+ exact: true,
85
+ cache: true
86
+ )
87
+ _validate_format_argument(format)
88
+ Utils.wrap_expr(
89
+ self._rbexpr.str_to_datetime(
90
+ format,
91
+ time_unit,
92
+ time_zone,
93
+ strict,
94
+ exact,
95
+ cache
96
+ )
97
+ )
98
+ end
99
+
100
+ # Convert a Utf8 column into a Time column.
101
+ #
102
+ # @param format [String]
103
+ # Format to use for conversion. Refer to the
104
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
105
+ # for the full specification. Example: `"%H:%M:%S"`.
106
+ # If set to nil (default), the format is inferred from the data.
107
+ # @param strict [Boolean]
108
+ # Raise an error if any conversion fails.
109
+ # @param cache [Boolean]
110
+ # Use a cache of unique, converted times to apply the conversion.
111
+ #
112
+ # @return [Expr]
113
+ #
114
+ # @example
115
+ # s = Polars::Series.new(["01:00", "02:00", "03:00"])
116
+ # s.str.to_time("%H:%M")
117
+ # # =>
118
+ # # shape: (3,)
119
+ # # Series: '' [time]
120
+ # # [
121
+ # # 01:00:00
122
+ # # 02:00:00
123
+ # # 03:00:00
124
+ # # ]
125
+ def to_time(format = nil, strict: true, cache: true)
126
+ _validate_format_argument(format)
127
+ Utils.wrap_expr(_rbexpr.str_to_time(format, strict, cache))
128
+ end
129
+
12
130
  # Parse a Utf8 expression to a Date/Datetime/Time type.
13
131
  #
14
- # @param datatype [Symbol]
15
- # `:date`, `:dateime`, or `:time`.
16
- # @param fmt [String]
132
+ # @param dtype [Object]
133
+ # The data type to convert into. Can be either Date, Datetime, or Time.
134
+ # @param format [String]
17
135
  # Format to use, refer to the
18
136
  # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
19
137
  # for specification. Example: `"%y-%m-%d"`.
@@ -33,57 +151,58 @@ module Polars
33
151
  # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
34
152
  # no fractional second component is found then the default is "us".
35
153
  #
36
- # @example
154
+ # @example Dealing with a consistent format:
155
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
156
+ # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
157
+ # # =>
158
+ # # shape: (2,)
159
+ # # Series: '' [datetime[μs, UTC]]
160
+ # # [
161
+ # # 2020-01-01 01:00:00 UTC
162
+ # # 2020-01-01 02:00:00 UTC
163
+ # # ]
164
+ #
165
+ # @example Dealing with different formats:
37
166
  # s = Polars::Series.new(
38
167
  # "date",
39
168
  # [
40
169
  # "2021-04-22",
41
170
  # "2022-01-04 00:00:00",
42
171
  # "01/31/22",
43
- # "Sun Jul 8 00:34:60 2001"
172
+ # "Sun Jul 8 00:34:60 2001",
44
173
  # ]
45
174
  # )
46
- # s.to_frame.with_column(
47
- # Polars.col("date")
48
- # .str.strptime(:date, "%F", strict: false)
49
- # .fill_null(
50
- # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
51
- # )
52
- # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
53
- # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
54
- # )
175
+ # s.to_frame.select(
176
+ # Polars.coalesce(
177
+ # Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
178
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
179
+ # Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
180
+ # Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
181
+ # )
182
+ # ).to_series
55
183
  # # =>
56
- # # shape: (4, 1)
57
- # # ┌────────────┐
58
- # # │ date │
59
- # # │ --- │
60
- # # │ date │
61
- # # ╞════════════╡
62
- # # │ 2021-04-22 │
63
- # # │ 2022-01-04 │
64
- # # 2022-01-31
65
- # # │ 2001-07-08 │
66
- # # └────────────┘
67
- def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
68
- if !Utils.is_polars_dtype(datatype)
69
- raise ArgumentError, "expected: {DataType} got: #{datatype}"
70
- end
184
+ # # shape: (4,)
185
+ # # Series: 'date' [date]
186
+ # # [
187
+ # # 2021-04-22
188
+ # # 2022-01-04
189
+ # # 2022-01-31
190
+ # # 2001-07-08
191
+ # # ]
192
+ def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
193
+ _validate_format_argument(format)
71
194
 
72
- if datatype == :date
73
- Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact, cache))
74
- elsif datatype == :datetime
75
- # TODO fix
76
- tu = nil # datatype.tu
77
- dtcol = Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware, utc))
78
- if tu.nil?
79
- dtcol
80
- else
81
- dtcol.dt.cast_time_unit(tu)
82
- end
83
- elsif datatype == :time
84
- Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact, cache))
195
+ if dtype == Date
196
+ to_date(format, strict: strict, exact: exact, cache: cache)
197
+ elsif dtype == Datetime || dtype.is_a?(Datetime)
198
+ dtype = Datetime.new if dtype == Datetime
199
+ time_unit = dtype.time_unit
200
+ time_zone = dtype.time_zone
201
+ to_datetime(format, time_unit: time_unit, time_zone: time_zone, strict: strict, exact: exact, cache: cache)
202
+ elsif dtype == Time
203
+ to_time(format, strict: strict, cache: cache)
85
204
  else
86
- raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
205
+ raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
87
206
  end
88
207
  end
89
208
 
@@ -521,6 +640,40 @@ module Polars
521
640
  Utils.wrap_expr(_rbexpr.str_starts_with(sub))
522
641
  end
523
642
 
643
+ # Parse string values as JSON.
644
+ #
645
+ # Raises an error if an invalid JSON string is encountered.
646
+ #
647
+ # @param dtype [Object]
648
+ # The dtype to cast the extracted value to. If nil, the dtype will be
649
+ # inferred from the JSON value.
650
+ #
651
+ # @return [Expr]
652
+ #
653
+ # @example
654
+ # df = Polars::DataFrame.new(
655
+ # {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
656
+ # )
657
+ # dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
658
+ # df.select(Polars.col("json").str.json_extract(dtype))
659
+ # # =>
660
+ # # shape: (3, 1)
661
+ # # ┌─────────────┐
662
+ # # │ json │
663
+ # # │ --- │
664
+ # # │ struct[2] │
665
+ # # ╞═════════════╡
666
+ # # │ {1,true} │
667
+ # # │ {null,null} │
668
+ # # │ {2,false} │
669
+ # # └─────────────┘
670
+ def json_extract(dtype = nil, infer_schema_length: 100)
671
+ if !dtype.nil?
672
+ dtype = Utils.rb_type_to_dtype(dtype)
673
+ end
674
+ Utils.wrap_expr(_rbexpr.str_json_extract(dtype, infer_schema_length))
675
+ end
676
+
524
677
  # Extract the first match of json string with provided JSONPath expression.
525
678
  #
526
679
  # Raises an error if an invalid JSON string is encountered.
@@ -846,10 +999,10 @@ module Polars
846
999
  # # │ 1 ┆ 123ABC │
847
1000
  # # │ 2 ┆ abc456 │
848
1001
  # # └─────┴────────┘
849
- def replace(pattern, value, literal: false)
1002
+ def replace(pattern, value, literal: false, n: 1)
850
1003
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
851
1004
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
852
- Utils.wrap_expr(_rbexpr.str_replace(pattern._rbexpr, value._rbexpr, literal))
1005
+ Utils.wrap_expr(_rbexpr.str_replace_n(pattern._rbexpr, value._rbexpr, literal, n))
853
1006
  end
854
1007
 
855
1008
  # Replace all matching regex/literal substrings with a new string value.
@@ -912,5 +1065,84 @@ module Polars
912
1065
  def slice(offset, length = nil)
913
1066
  Utils.wrap_expr(_rbexpr.str_slice(offset, length))
914
1067
  end
1068
+
1069
+ # Returns a column with a separate row for every string character.
1070
+ #
1071
+ # @return [Expr]
1072
+ #
1073
+ # @example
1074
+ # df = Polars::DataFrame.new({"a": ["foo", "bar"]})
1075
+ # df.select(Polars.col("a").str.explode)
1076
+ # # =>
1077
+ # # shape: (6, 1)
1078
+ # # ┌─────┐
1079
+ # # │ a │
1080
+ # # │ --- │
1081
+ # # │ str │
1082
+ # # ╞═════╡
1083
+ # # │ f │
1084
+ # # │ o │
1085
+ # # │ o │
1086
+ # # │ b │
1087
+ # # │ a │
1088
+ # # │ r │
1089
+ # # └─────┘
1090
+ def explode
1091
+ Utils.wrap_expr(_rbexpr.str_explode)
1092
+ end
1093
+
1094
+ # Parse integers with base radix from strings.
1095
+ #
1096
+ # The default radix is 2. When strict is false, parse errors and overflows become null.
1097
+ #
1098
+ # @param radix [Integer]
1099
+ # Positive integer which is the base of the string we are parsing.
1100
+ # Default: 2.
1101
+ # @param strict [Boolean]
1102
+ # If true (default), raise any ParseError or overflow as a ComputeError.
1103
+ # If false, silently convert such values to null.
1104
+ #
1105
+ # @return [Expr]
1106
+ #
1107
+ # @example
1108
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1109
+ # df.select(Polars.col("bin").str.parse_int(2, strict: false))
1110
+ # # =>
1111
+ # # shape: (4, 1)
1112
+ # # ┌──────┐
1113
+ # # │ bin │
1114
+ # # │ --- │
1115
+ # # │ i32 │
1116
+ # # ╞══════╡
1117
+ # # │ 6 │
1118
+ # # │ 5 │
1119
+ # # │ 2 │
1120
+ # # │ null │
1121
+ # # └──────┘
1122
+ #
1123
+ # @example
1124
+ # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1125
+ # df.select(Polars.col("hex").str.parse_int(16, strict: true))
1126
+ # # =>
1127
+ # # shape: (4, 1)
1128
+ # # ┌───────┐
1129
+ # # │ hex │
1130
+ # # │ --- │
1131
+ # # │ i32 │
1132
+ # # ╞═══════╡
1133
+ # # │ 64030 │
1134
+ # # │ 65280 │
1135
+ # # │ 51966 │
1136
+ # # │ null │
1137
+ # # └───────┘
1138
+ def parse_int(radix = 2, strict: true)
1139
+ Utils.wrap_expr(_rbexpr.str_parse_int(radix, strict))
1140
+ end
1141
+
1142
+ private
1143
+
1144
+ def _validate_format_argument(format)
1145
+ # TODO: validate the format argument (this method is currently a no-op).
1146
+ end
915
1147
  end
916
1148
  end
@@ -10,6 +10,112 @@ module Polars
10
10
  self._s = series._s
11
11
  end
12
12
 
13
+ # Convert a Utf8 column into a Date column.
14
+ #
15
+ # @param format [String]
16
+ # Format to use for conversion. Refer to the
17
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
18
+ # for the full specification. Example: `"%Y-%m-%d"`.
19
+ # If set to nil (default), the format is inferred from the data.
20
+ # @param strict [Boolean]
21
+ # Raise an error if any conversion fails.
22
+ # @param exact [Boolean]
23
+ # Require an exact format match. If false, allow the format to match anywhere
24
+ # in the target string.
25
+ # @param cache [Boolean]
26
+ # Use a cache of unique, converted dates to apply the conversion.
27
+ #
28
+ # @return [Series]
29
+ #
30
+ # @example
31
+ # s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
32
+ # s.str.to_date
33
+ # # =>
34
+ # # shape: (3,)
35
+ # # Series: '' [date]
36
+ # # [
37
+ # # 2020-01-01
38
+ # # 2020-02-01
39
+ # # 2020-03-01
40
+ # # ]
41
+ def to_date(format = nil, strict: true, exact: true, cache: true)
42
+ super
43
+ end
44
+
45
+ # Convert a Utf8 column into a Datetime column.
46
+ #
47
+ # @param format [String]
48
+ # Format to use for conversion. Refer to the
49
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
50
+ # for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
51
+ # If set to nil (default), the format is inferred from the data.
52
+ # @param time_unit ["us", "ns", "ms"]
53
+ # Unit of time for the resulting Datetime column. If set to nil (default),
54
+ # the time unit is inferred from the format string if given, e.g.:
55
+ # `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
56
+ # found, the default is `"us"`.
57
+ # @param time_zone [String]
58
+ # Time zone for the resulting Datetime column.
59
+ # @param strict [Boolean]
60
+ # Raise an error if any conversion fails.
61
+ # @param exact [Boolean]
62
+ # Require an exact format match. If false, allow the format to match anywhere
63
+ # in the target string.
64
+ # @param cache [Boolean]
65
+ # Use a cache of unique, converted datetimes to apply the conversion.
66
+ #
67
+ # @return [Series]
68
+ #
69
+ # @example
70
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
71
+ # s.str.to_datetime("%Y-%m-%d %H:%M%#z")
72
+ # # =>
73
+ # # shape: (2,)
74
+ # # Series: '' [datetime[μs, UTC]]
75
+ # # [
76
+ # # 2020-01-01 01:00:00 UTC
77
+ # # 2020-01-01 02:00:00 UTC
78
+ # # ]
79
+ def to_datetime(
80
+ format = nil,
81
+ time_unit: nil,
82
+ time_zone: nil,
83
+ strict: true,
84
+ exact: true,
85
+ cache: true
86
+ )
87
+ super
88
+ end
89
+
90
+ # Convert a Utf8 column into a Time column.
91
+ #
92
+ # @param format [String]
93
+ # Format to use for conversion. Refer to the
94
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
95
+ # for the full specification. Example: `"%H:%M:%S"`.
96
+ # If set to nil (default), the format is inferred from the data.
97
+ # @param strict [Boolean]
98
+ # Raise an error if any conversion fails.
99
+ # @param cache [Boolean]
100
+ # Use a cache of unique, converted times to apply the conversion.
101
+ #
102
+ # @return [Series]
103
+ #
104
+ # @example
105
+ # s = Polars::Series.new(["01:00", "02:00", "03:00"])
106
+ # s.str.to_time("%H:%M")
107
+ # # =>
108
+ # # shape: (3,)
109
+ # # Series: '' [time]
110
+ # # [
111
+ # # 01:00:00
112
+ # # 02:00:00
113
+ # # 03:00:00
114
+ # # ]
115
+ def to_time(format = nil, strict: true, cache: true)
116
+ super
117
+ end
118
+
13
119
  # Parse a Series of dtype Utf8 to a Date/Datetime Series.
14
120
  #
15
121
  # @param datatype [Symbol]
@@ -23,10 +129,23 @@ module Polars
23
129
  # @param exact [Boolean]
24
130
  # - If true, require an exact format match.
25
131
  # - If false, allow the format to match anywhere in the target string.
132
+ # @param cache [Boolean]
133
+ # Use a cache of unique, converted dates to apply the datetime conversion.
26
134
  #
27
135
  # @return [Series]
28
136
  #
29
- # @example
137
+ # @example Dealing with a consistent format:
138
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
139
+ # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
140
+ # # =>
141
+ # # shape: (2,)
142
+ # # Series: '' [datetime[μs, UTC]]
143
+ # # [
144
+ # # 2020-01-01 01:00:00 UTC
145
+ # # 2020-01-01 02:00:00 UTC
146
+ # # ]
147
+ #
148
+ # @example Dealing with different formats:
30
149
  # s = Polars::Series.new(
31
150
  # "date",
32
151
  # [
@@ -36,28 +155,24 @@ module Polars
36
155
  # "Sun Jul 8 00:34:60 2001"
37
156
  # ]
38
157
  # )
39
- # s.to_frame.with_column(
40
- # Polars.col("date")
41
- # .str.strptime(:date, "%F", strict: false)
42
- # .fill_null(
43
- # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
44
- # )
45
- # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
46
- # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
47
- # )
158
+ # s.to_frame.select(
159
+ # Polars.coalesce(
160
+ # Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
161
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
162
+ # Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
163
+ # Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
164
+ # )
165
+ # ).to_series
48
166
  # # =>
49
- # # shape: (4, 1)
50
- # # ┌────────────┐
51
- # # │ date │
52
- # # │ --- │
53
- # # │ date │
54
- # # ╞════════════╡
55
- # # │ 2021-04-22 │
56
- # # │ 2022-01-04 │
57
- # # 2022-01-31
58
- # # │ 2001-07-08 │
59
- # # └────────────┘
60
- def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
167
+ # # shape: (4,)
168
+ # # Series: 'date' [date]
169
+ # # [
170
+ # # 2021-04-22
171
+ # # 2022-01-04
172
+ # # 2022-01-31
173
+ # # 2001-07-08
174
+ # # ]
175
+ def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true)
61
176
  super
62
177
  end
63
178
 
@@ -60,5 +60,37 @@ module Polars
60
60
  def rename_fields(names)
61
61
  super
62
62
  end
63
+
64
+ # Get the struct definition as a name/dtype schema hash.
65
+ #
66
+ # @return [Object]
67
+ def schema
68
+ if _s.nil?
69
+ {}
70
+ else
71
+ _s.dtype.to_schema
72
+ end
73
+ end
74
+
75
+ # Convert this struct Series to a DataFrame with a separate column for each field.
76
+ #
77
+ # @return [DataFrame]
78
+ #
79
+ # @example
80
+ # s = Polars::Series.new([{"a" => 1, "b" => 2}, {"a" => 3, "b" => 4}])
81
+ # s.struct.unnest
82
+ # # =>
83
+ # # shape: (2, 2)
84
+ # # ┌─────┬─────┐
85
+ # # │ a ┆ b │
86
+ # # │ --- ┆ --- │
87
+ # # │ i64 ┆ i64 │
88
+ # # ╞═════╪═════╡
89
+ # # │ 1 ┆ 2 │
90
+ # # │ 3 ┆ 4 │
91
+ # # └─────┴─────┘
92
+ def unnest
93
+ Utils.wrap_df(_s.struct_unnest)
94
+ end
63
95
  end
64
96
  end