polars-df 0.3.1-x86_64-linux → 0.5.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,8 +11,8 @@ module Polars
11
11
 
12
12
  # Parse a Utf8 expression to a Date/Datetime/Time type.
13
13
  #
14
- # @param datatype [Symbol]
15
- # `:date`, `:dateime`, or `:time`.
14
+ # @param dtype [Object]
15
+ # The data type to convert into. Can be either Date, Datetime, or Time.
16
16
  # @param fmt [String]
17
17
  # Format to use, refer to the
18
18
  # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
@@ -33,57 +33,56 @@ module Polars
33
33
  # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
34
34
  # no fractional second component is found then the default is "us".
35
35
  #
36
- # @example
36
+ # @example Dealing with a consistent format:
37
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
38
+ # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
39
+ # # =>
40
+ # # shape: (2,)
41
+ # # Series: '' [datetime[μs, +00:00]]
42
+ # # [
43
+ # # 2020-01-01 01:00:00 +00:00
44
+ # # 2020-01-01 02:00:00 +00:00
45
+ # # ]
46
+ #
47
+ # @example Dealing with different formats.
37
48
  # s = Polars::Series.new(
38
49
  # "date",
39
50
  # [
40
51
  # "2021-04-22",
41
52
  # "2022-01-04 00:00:00",
42
53
  # "01/31/22",
43
- # "Sun Jul 8 00:34:60 2001"
54
+ # "Sun Jul 8 00:34:60 2001",
44
55
  # ]
45
56
  # )
46
- # s.to_frame.with_column(
47
- # Polars.col("date")
48
- # .str.strptime(:date, "%F", strict: false)
49
- # .fill_null(
50
- # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
51
- # )
52
- # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
53
- # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
54
- # )
57
+ # s.to_frame.select(
58
+ # Polars.coalesce(
59
+ # Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
60
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
61
+ # Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
62
+ # Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
63
+ # )
64
+ # ).to_series
55
65
  # # =>
56
- # # shape: (4, 1)
57
- # # ┌────────────┐
58
- # # │ date │
59
- # # │ --- │
60
- # # │ date │
61
- # # ╞════════════╡
62
- # # │ 2021-04-22 │
63
- # # │ 2022-01-04 │
64
- # # 2022-01-31
65
- # # 2001-07-08
66
- # # └────────────┘
67
- def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
68
- if !Utils.is_polars_dtype(datatype)
69
- raise ArgumentError, "expected: {DataType} got: #{datatype}"
70
- end
71
-
72
- if datatype == :date
66
+ # # shape: (4,)
67
+ # # Series: 'date' [date]
68
+ # # [
69
+ # # 2021-04-22
70
+ # # 2022-01-04
71
+ # # 2022-01-31
72
+ # # 2001-07-08
73
+ # # ]
74
+ def strptime(dtype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
75
+ if dtype == Date
73
76
  Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact, cache))
74
- elsif datatype == :datetime
75
- # TODO fix
76
- tu = nil # datatype.tu
77
- dtcol = Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware, utc))
78
- if tu.nil?
79
- dtcol
80
- else
81
- dtcol.dt.cast_time_unit(tu)
82
- end
83
- elsif datatype == :time
77
+ elsif dtype == Datetime || dtype.is_a?(Datetime)
78
+ dtype = Datetime.new if dtype == Datetime
79
+ time_unit = dtype.time_unit
80
+ time_zone = dtype.time_zone
81
+ Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, time_unit, time_zone, strict, exact, cache, tz_aware, utc))
82
+ elsif dtype == Time
84
83
  Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact, cache))
85
84
  else
86
- raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
85
+ raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
87
86
  end
88
87
  end
89
88
 
@@ -332,7 +331,7 @@ module Polars
332
331
  # # │ -0001 │
333
332
  # # │ 00000 │
334
333
  # # │ 00001 │
335
- # # │ ...
334
+ # # │
336
335
  # # │ 10000 │
337
336
  # # │ 100000 │
338
337
  # # │ 1000000 │
@@ -521,6 +520,40 @@ module Polars
521
520
  Utils.wrap_expr(_rbexpr.str_starts_with(sub))
522
521
  end
523
522
 
523
+ # Parse string values as JSON.
524
+ #
525
+ # Raises an error if an invalid JSON string is encountered.
526
+ #
527
+ # @param dtype [Object]
528
+ # The dtype to cast the extracted value to. If nil, the dtype will be
529
+ # inferred from the JSON value.
530
+ #
531
+ # @return [Expr]
532
+ #
533
+ # @example
534
+ # df = Polars::DataFrame.new(
535
+ # {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
536
+ # )
537
+ # dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
538
+ # df.select(Polars.col("json").str.json_extract(dtype))
539
+ # # =>
540
+ # # shape: (3, 1)
541
+ # # ┌─────────────┐
542
+ # # │ json │
543
+ # # │ --- │
544
+ # # │ struct[2] │
545
+ # # ╞═════════════╡
546
+ # # │ {1,true} │
547
+ # # │ {null,null} │
548
+ # # │ {2,false} │
549
+ # # └─────────────┘
550
+ def json_extract(dtype = nil)
551
+ if !dtype.nil?
552
+ dtype = Utils.rb_type_to_dtype(dtype)
553
+ end
554
+ Utils.wrap_expr(_rbexpr.str_json_extract(dtype))
555
+ end
556
+
524
557
  # Extract the first match of json string with provided JSONPath expression.
525
558
  #
526
559
  # Raises an error if an invalid JSON string is encountered.
@@ -846,10 +879,10 @@ module Polars
846
879
  # # │ 1 ┆ 123ABC │
847
880
  # # │ 2 ┆ abc456 │
848
881
  # # └─────┴────────┘
849
- def replace(pattern, value, literal: false)
882
+ def replace(pattern, value, literal: false, n: 1)
850
883
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
851
884
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
852
- Utils.wrap_expr(_rbexpr.str_replace(pattern._rbexpr, value._rbexpr, literal))
885
+ Utils.wrap_expr(_rbexpr.str_replace_n(pattern._rbexpr, value._rbexpr, literal, n))
853
886
  end
854
887
 
855
888
  # Replace all matching regex/literal substrings with a new string value.
@@ -912,5 +945,78 @@ module Polars
912
945
  def slice(offset, length = nil)
913
946
  Utils.wrap_expr(_rbexpr.str_slice(offset, length))
914
947
  end
948
+
949
+ # Returns a column with a separate row for every string character.
950
+ #
951
+ # @return [Expr]
952
+ #
953
+ # @example
954
+ # df = Polars::DataFrame.new({"a": ["foo", "bar"]})
955
+ # df.select(Polars.col("a").str.explode)
956
+ # # =>
957
+ # # shape: (6, 1)
958
+ # # ┌─────┐
959
+ # # │ a │
960
+ # # │ --- │
961
+ # # │ str │
962
+ # # ╞═════╡
963
+ # # │ f │
964
+ # # │ o │
965
+ # # │ o │
966
+ # # │ b │
967
+ # # │ a │
968
+ # # │ r │
969
+ # # └─────┘
970
+ def explode
971
+ Utils.wrap_expr(_rbexpr.explode)
972
+ end
973
+
974
+ # Parse integers with base radix from strings.
975
+ #
976
+ # Defaults to base 2. With `strict: false`, parse errors and overflows become null; with `strict: true` (the default) they raise instead.
977
+ #
978
+ # @param radix [Integer]
979
+ # Positive integer which is the base of the string we are parsing.
980
+ # Default: 2.
981
+ # @param strict [Boolean]
982
+ # If true (the default), raise a ComputeError on any parse error or overflow.
983
+ # If false, silently convert such values to null.
984
+ #
985
+ # @return [Expr]
986
+ #
987
+ # @example
988
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
989
+ # df.select(Polars.col("bin").str.parse_int(2, strict: false))
990
+ # # =>
991
+ # # shape: (4, 1)
992
+ # # ┌──────┐
993
+ # # │ bin │
994
+ # # │ --- │
995
+ # # │ i32 │
996
+ # # ╞══════╡
997
+ # # │ 6 │
998
+ # # │ 5 │
999
+ # # │ 2 │
1000
+ # # │ null │
1001
+ # # └──────┘
1002
+ #
1003
+ # @example
1004
+ # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1005
+ # df.select(Polars.col("hex").str.parse_int(16, strict: true))
1006
+ # # =>
1007
+ # # shape: (4, 1)
1008
+ # # ┌───────┐
1009
+ # # │ hex │
1010
+ # # │ --- │
1011
+ # # │ i32 │
1012
+ # # ╞═══════╡
1013
+ # # │ 64030 │
1014
+ # # │ 65280 │
1015
+ # # │ 51966 │
1016
+ # # │ null │
1017
+ # # └───────┘
1018
+ def parse_int(radix = 2, strict: true)
1019
+ Utils.wrap_expr(_rbexpr.str_parse_int(radix, strict))
1020
+ end
915
1021
  end
916
1022
  end
@@ -38,12 +38,12 @@ module Polars
38
38
  # )
39
39
  # s.to_frame.with_column(
40
40
  # Polars.col("date")
41
- # .str.strptime(:date, "%F", strict: false)
41
+ # .str.strptime(Polars::Date, "%F", strict: false)
42
42
  # .fill_null(
43
- # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
43
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false)
44
44
  # )
45
- # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
46
- # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
45
+ # .fill_null(Polars.col("date").str.strptime(Polars::Date, "%D", strict: false))
46
+ # .fill_null(Polars.col("date").str.strptime(Polars::Date, "%c", strict: false))
47
47
  # )
48
48
  # # =>
49
49
  # # shape: (4, 1)
@@ -60,5 +60,37 @@ module Polars
60
60
  def rename_fields(names)
61
61
  super
62
62
  end
63
+
64
+ # Get the struct definition as a name/dtype schema hash.
65
+ #
66
+ # @return [Object]
67
+ def schema
68
+ if _s.nil?
69
+ {}
70
+ else
71
+ _s.dtype.to_schema
72
+ end
73
+ end
74
+
75
+ # Convert this struct Series to a DataFrame with a separate column for each field.
76
+ #
77
+ # @return [DataFrame]
78
+ #
79
+ # @example
80
+ # s = Polars::Series.new([{"a" => 1, "b" => 2}, {"a" => 3, "b" => 4}])
81
+ # s.struct.unnest
82
+ # # =>
83
+ # # shape: (2, 2)
84
+ # # ┌─────┬─────┐
85
+ # # │ a ┆ b │
86
+ # # │ --- ┆ --- │
87
+ # # │ i64 ┆ i64 │
88
+ # # ╞═════╪═════╡
89
+ # # │ 1 ┆ 2 │
90
+ # # │ 3 ┆ 4 │
91
+ # # └─────┴─────┘
92
+ def unnest
93
+ Utils.wrap_df(_s.struct_unnest)
94
+ end
63
95
  end
64
96
  end
data/lib/polars/utils.rb CHANGED
@@ -23,24 +23,42 @@ module Polars
23
23
  Polars.col(name)
24
24
  end
25
25
 
26
+ def self.arrlen(obj)
27
+ if obj.is_a?(Range)
28
+ # size only works for numeric ranges
29
+ obj.to_a.length
30
+ elsif obj.is_a?(String)
31
+ nil
32
+ else
33
+ obj.length
34
+ end
35
+ rescue
36
+ nil
37
+ end
38
+
26
39
  def self._timedelta_to_pl_duration(td)
27
40
  td
28
41
  end
29
42
 
30
43
  def self._datetime_to_pl_timestamp(dt, tu)
31
44
  if tu == "ns"
32
- (dt.to_datetime.utc.to_f * 1e9).to_i
45
+ (dt.to_datetime.to_time.to_f * 1e9).to_i
33
46
  elsif tu == "us"
34
- (dt.to_datetime.utc.to_f * 1e6).to_i
47
+ (dt.to_datetime.to_time.to_f * 1e6).to_i
35
48
  elsif tu == "ms"
36
- (dt.to_datetime.utc.to_f * 1e3).to_i
49
+ (dt.to_datetime.to_time.to_f * 1e3).to_i
37
50
  elsif tu.nil?
38
- (dt.to_datetime.utc.to_f * 1e6).to_i
51
+ (dt.to_datetime.to_time.to_f * 1e6).to_i
39
52
  else
40
53
  raise ArgumentError, "tu must be one of {{'ns', 'us', 'ms'}}, got #{tu}"
41
54
  end
42
55
  end
43
56
 
57
+ def self._date_to_pl_date(d)
58
+ dt = d.to_datetime.to_time
59
+ dt.to_i / (3600 * 24)
60
+ end
61
+
44
62
  def self._to_ruby_datetime(value, dtype, tu: "ns", tz: nil)
45
63
  if dtype == :date || dtype == Date
46
64
  # days to seconds
@@ -69,6 +87,18 @@ module Polars
69
87
  end
70
88
  end
71
89
 
90
+ def self._to_ruby_duration(value, tu = "ns")
91
+ if tu == "ns"
92
+ value / 1e9
93
+ elsif tu == "us"
94
+ value / 1e6
95
+ elsif tu == "ms"
96
+ value / 1e3
97
+ else
98
+ raise ArgumentError, "tu must be one of {{'ns', 'us', 'ms'}}, got #{tu}"
99
+ end
100
+ end
101
+
72
102
  def self.selection_to_rbexpr_list(exprs)
73
103
  if exprs.is_a?(String) || exprs.is_a?(Symbol) || exprs.is_a?(Expr) || exprs.is_a?(Series)
74
104
  exprs = [exprs]
@@ -93,12 +123,19 @@ module Polars
93
123
  Polars.lit(value)
94
124
  end
95
125
 
96
- def self.format_path(path)
97
- File.expand_path(path)
126
+ def self.normalise_filepath(path, check_not_directory: true)
127
+ path = File.expand_path(path)
128
+ if check_not_directory && File.exist?(path) && Dir.exist?(path)
129
+ raise ArgumentError, "Expected a file path; #{path} is a directory"
130
+ end
131
+ path
98
132
  end
99
133
 
100
134
  # TODO fix
101
- def self.is_polars_dtype(data_type)
135
+ def self.is_polars_dtype(data_type, include_unknown: false)
136
+ if data_type == Unknown
137
+ return include_unknown
138
+ end
102
139
  data_type.is_a?(Symbol) || data_type.is_a?(String) || data_type.is_a?(DataType) || (data_type.is_a?(Class) && data_type < DataType)
103
140
  end
104
141
 
@@ -109,7 +146,8 @@ module Polars
109
146
  TrueClass => :bool,
110
147
  FalseClass => :bool,
111
148
  ::Date => :date,
112
- ::DateTime => :datetime
149
+ ::DateTime => :datetime,
150
+ ::Time => :datetime
113
151
  }
114
152
 
115
153
  # TODO fix
@@ -174,7 +212,7 @@ module Polars
174
212
  end
175
213
 
176
214
  def self.bool?(value)
177
- value == true || value == false
215
+ value.is_a?(TrueClass) || value.is_a?(FalseClass)
178
216
  end
179
217
 
180
218
  def self.strlike?(value)
@@ -216,5 +254,9 @@ module Polars
216
254
  val.is_a?(Array) && _is_iterable_of(val, String)
217
255
  end
218
256
  end
257
+
258
+ def self.local_file?(file)
259
+ Dir.glob(file).any?
260
+ end
219
261
  end
220
262
  end
@@ -1,4 +1,4 @@
1
1
  module Polars
2
2
  # @private
3
- VERSION = "0.3.1"
3
+ VERSION = "0.5.0"
4
4
  end
data/lib/polars.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  # ext
2
2
  begin
3
- require_relative "polars/#{RUBY_VERSION.to_f}/polars"
3
+ require "polars/#{RUBY_VERSION.to_f}/polars"
4
4
  rescue LoadError
5
- require_relative "polars/polars"
5
+ require "polars/polars"
6
6
  end
7
7
 
8
8
  # stdlib
@@ -12,6 +12,8 @@ require "stringio"
12
12
  # modules
13
13
  require_relative "polars/expr_dispatch"
14
14
  require_relative "polars/batched_csv_reader"
15
+ require_relative "polars/binary_expr"
16
+ require_relative "polars/binary_name_space"
15
17
  require_relative "polars/cat_expr"
16
18
  require_relative "polars/cat_name_space"
17
19
  require_relative "polars/convert"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polars-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.5.0
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-22 00:00:00.000000000 Z
11
+ date: 2023-05-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -29,6 +29,8 @@ files:
29
29
  - lib/polars/3.1/polars.so
30
30
  - lib/polars/3.2/polars.so
31
31
  - lib/polars/batched_csv_reader.rb
32
+ - lib/polars/binary_expr.rb
33
+ - lib/polars/binary_name_space.rb
32
34
  - lib/polars/cat_expr.rb
33
35
  - lib/polars/cat_name_space.rb
34
36
  - lib/polars/convert.rb