polars-df 0.10.0-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
@@ -0,0 +1,104 @@
1
+ module Polars
2
+ # @private
3
+ class Slice
4
+ def initialize(obj)
5
+ @obj = obj
6
+ end
7
+
8
+ # Apply a slice operation, taking advantage of any potential fast paths.
9
+ def apply(s)
10
+ # normalize slice
11
+ _slice_setup(s)
12
+
13
+ # check for fast-paths / single-operation calls
14
+ if @slice_length == 0
15
+ @obj.cleared
16
+ elsif @is_unbounded && [-1, 1].include?(@stride)
17
+ @stride < 0 ? @obj.reverse : @obj.clone
18
+ elsif @start >= 0 && @stop >= 0 && @stride == 1
19
+ @obj.slice(@start, @slice_length)
20
+ elsif @stride < 0 && @slice_length == 1
21
+ @obj.slice(@stop + 1, 1)
22
+ else
23
+ # multi-operation calls; make lazy
24
+ lazyobj = _lazify(@obj)
25
+ sliced = @stride > 0 ? _slice_positive(lazyobj) : _slice_negative(lazyobj)
26
+ _as_original(sliced, @obj)
27
+ end
28
+ end
29
+
30
+ private
31
+
32
+ # Return lazy variant back to its original type.
33
+ def _as_original(lazy, original)
34
+ frame = lazy.collect
35
+ original.is_a?(DataFrame) ? frame : frame.to_series
36
+ end
37
+
38
+ # Make lazy to ensure efficient/consistent handling.
39
+ def _lazify(obj)
40
+ obj.is_a?(DataFrame) ? obj.lazy : obj.to_frame.lazy
41
+ end
42
+
43
+ # Logic for slices with positive stride.
44
+ def _slice_positive(obj)
45
+ # note: at this point stride is guaranteed to be > 1
46
+ obj.slice(@start, @slice_length).take_every(@stride)
47
+ end
48
+
49
+ # Logic for slices with negative stride.
50
+ def _slice_negative(obj)
51
+ stride = @stride.abs
52
+ lazyslice = obj.slice(@stop + 1, @slice_length).reverse
53
+ stride > 1 ? lazyslice.take_every(stride) : lazyslice
54
+ end
55
+
56
+ # Normalize slice bounds, identify unbounded and/or zero-length slices.
57
+ def _slice_setup(s)
58
+ # can normalize slice indices as we know object size
59
+ obj_len = @obj.length
60
+ start = if s.begin
61
+ if s.begin < 0
62
+ [s.begin + obj_len, 0].max
63
+ else
64
+ s.begin
65
+ end
66
+ else
67
+ 0
68
+ end
69
+ stop = if s.end
70
+ if s.end < 0
71
+ s.end + (s.exclude_end? ? 0 : 1) + obj_len
72
+ else
73
+ s.end + (s.exclude_end? ? 0 : 1)
74
+ end
75
+ else
76
+ obj_len
77
+ end
78
+ stride = 1
79
+
80
+ # check if slice is actually unbounded
81
+ if stride >= 1
82
+ @is_unbounded = start <= 0 && stop >= obj_len
83
+ else
84
+ @is_unbounded = stop == -1 && start >= obj_len - 1
85
+ end
86
+
87
+ # determine slice length
88
+ if @obj.is_empty
89
+ @slice_length = 0
90
+ elsif @is_unbounded
91
+ @slice_length = obj_len
92
+ else
93
+ @slice_length = if start == stop || (stride > 0 && start > stop) || (stride < 0 && start < stop)
94
+ 0
95
+ else
96
+ (stop - start).abs
97
+ end
98
+ end
99
+ @start = start
100
+ @stop = stop
101
+ @stride = stride
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,194 @@
1
+ module Polars
2
+ # Run SQL queries against DataFrame/LazyFrame data.
3
+ class SQLContext
4
+ # @private
5
+ attr_accessor :_ctxt, :_eager_execution
6
+
7
+ # Initialize a new `SQLContext`.
8
+ def initialize(frames = nil, eager_execution: false, **named_frames)
9
+ self._ctxt = RbSQLContext.new
10
+ self._eager_execution = eager_execution
11
+
12
+ frames = (frames || {}).to_h
13
+
14
+ if frames.any? || named_frames.any?
15
+ register_many(frames, **named_frames)
16
+ end
17
+ end
18
+
19
+ # Parse the given SQL query and execute it against the registered frame data.
20
+ #
21
+ # @param query [String]
22
+ # A valid string SQL query.
23
+ # @param eager [Boolean]
24
+ # Apply the query eagerly, returning `DataFrame` instead of `LazyFrame`.
25
+ # If unset, the value of the init-time parameter "eager_execution" will be
26
+ # used. (Note that the query itself is always executed in lazy-mode; this
27
+ # parameter only impacts the type of the returned frame).
28
+ #
29
+ # @return [Object]
30
+ #
31
+ # @example Execute a SQL query against the registered frame data:
32
+ # df = Polars::DataFrame.new(
33
+ # [
34
+ # ["The Godfather", 1972, 6_000_000, 134_821_952, 9.2],
35
+ # ["The Dark Knight", 2008, 185_000_000, 533_316_061, 9.0],
36
+ # ["Schindler's List", 1993, 22_000_000, 96_067_179, 8.9],
37
+ # ["Pulp Fiction", 1994, 8_000_000, 107_930_000, 8.9],
38
+ # ["The Shawshank Redemption", 1994, 25_000_000, 28_341_469, 9.3],
39
+ # ],
40
+ # schema: ["title", "release_year", "budget", "gross", "imdb_score"]
41
+ # )
42
+ # ctx = Polars::SQLContext.new(films: df)
43
+ # ctx.execute(
44
+ # "
45
+ # SELECT title, release_year, imdb_score
46
+ # FROM films
47
+ # WHERE release_year > 1990
48
+ # ORDER BY imdb_score DESC
49
+ # ",
50
+ # eager: true
51
+ # )
52
+ # # =>
53
+ # # shape: (4, 3)
54
+ # # ┌──────────────────────────┬──────────────┬────────────┐
55
+ # # │ title ┆ release_year ┆ imdb_score │
56
+ # # │ --- ┆ --- ┆ --- │
57
+ # # │ str ┆ i64 ┆ f64 │
58
+ # # ╞══════════════════════════╪══════════════╪════════════╡
59
+ # # │ The Shawshank Redemption ┆ 1994 ┆ 9.3 │
60
+ # # │ The Dark Knight ┆ 2008 ┆ 9.0 │
61
+ # # │ Schindler's List ┆ 1993 ┆ 8.9 │
62
+ # # │ Pulp Fiction ┆ 1994 ┆ 8.9 │
63
+ # # └──────────────────────────┴──────────────┴────────────┘
64
+ #
65
+ # @example Execute a GROUP BY query:
66
+ # ctx.execute(
67
+ # "
68
+ # SELECT
69
+ # MAX(release_year / 10) * 10 AS decade,
70
+ # SUM(gross) AS total_gross,
71
+ # COUNT(title) AS n_films,
72
+ # FROM films
73
+ # GROUP BY (release_year / 10) -- decade
74
+ # ORDER BY total_gross DESC
75
+ # ",
76
+ # eager: true
77
+ # )
78
+ # # =>
79
+ # # shape: (3, 3)
80
+ # # ┌────────┬─────────────┬─────────┐
81
+ # # │ decade ┆ total_gross ┆ n_films │
82
+ # # │ --- ┆ --- ┆ --- │
83
+ # # │ i64 ┆ i64 ┆ u32 │
84
+ # # ╞════════╪═════════════╪═════════╡
85
+ # # │ 2000 ┆ 533316061 ┆ 1 │
86
+ # # │ 1990 ┆ 232338648 ┆ 3 │
87
+ # # │ 1970 ┆ 134821952 ┆ 1 │
88
+ # # └────────┴─────────────┴─────────┘
89
+ def execute(query, eager: nil)
90
+ res = Utils.wrap_ldf(_ctxt.execute(query))
91
+ eager || _eager_execution ? res.collect : res
92
+ end
93
+
94
+ # Register a single frame as a table, using the given name.
95
+ #
96
+ # @param name [String]
97
+ # Name of the table.
98
+ # @param frame [Object]
99
+ # eager/lazy frame to associate with this table name.
100
+ #
101
+ # @return [SQLContext]
102
+ #
103
+ # @example
104
+ # df = Polars::DataFrame.new({"hello" => ["world"]})
105
+ # ctx = Polars::SQLContext.new
106
+ # ctx.register("frame_data", df).execute("SELECT * FROM frame_data").collect
107
+ # # =>
108
+ # # shape: (1, 1)
109
+ # # ┌───────┐
110
+ # # │ hello │
111
+ # # │ --- │
112
+ # # │ str │
113
+ # # ╞═══════╡
114
+ # # │ world │
115
+ # # └───────┘
116
+ def register(name, frame)
117
+ if frame.is_a?(DataFrame)
118
+ frame = frame.lazy
119
+ end
120
+ _ctxt.register(name.to_s, frame._ldf)
121
+ self
122
+ end
123
+
124
+ # Register multiple eager/lazy frames as tables, using the associated names.
125
+ #
126
+ # @param frames [Hash]
127
+ # A `{name:frame, ...}` mapping.
128
+ # @param named_frames [Object]
129
+ # Named eager/lazy frames, provided as kwargs.
130
+ #
131
+ # @return [SQLContext]
132
+ def register_many(frames, **named_frames)
133
+ frames = (frames || {}).to_h
134
+ frames = frames.merge(named_frames)
135
+ frames.each do |name, frame|
136
+ register(name, frame)
137
+ end
138
+ self
139
+ end
140
+
141
+ # Unregister one or more eager/lazy frames by name.
142
+ #
143
+ # @param names [Object]
144
+ # Names of the tables to unregister.
145
+ #
146
+ # @return [SQLContext]
147
+ #
148
+ # @example Register with a SQLContext object:
149
+ # df0 = Polars::DataFrame.new({"ints" => [9, 8, 7, 6, 5]})
150
+ # lf1 = Polars::LazyFrame.new({"text" => ["a", "b", "c"]})
151
+ # lf2 = Polars::LazyFrame.new({"misc" => ["testing1234"]})
152
+ # ctx = Polars::SQLContext.new(test1: df0, test2: lf1, test3: lf2)
153
+ # ctx.tables
154
+ # # => ["test1", "test2", "test3"]
155
+ #
156
+ # @example Unregister one or more of the tables:
157
+ # ctx.unregister(["test1", "test3"]).tables
158
+ # # => ["test2"]
159
+ def unregister(names)
160
+ if names.is_a?(::String)
161
+ names = [names]
162
+ end
163
+ names.each do |nm|
164
+ _ctxt.unregister(nm)
165
+ end
166
+ self
167
+ end
168
+
169
+ # Return a list of the registered table names.
170
+ #
171
+ # @return [Array]
172
+ #
173
+ # @example Executing as SQL:
174
+ # frame_data = Polars::DataFrame.new({"hello" => ["world"]})
175
+ # ctx = Polars::SQLContext.new(hello_world: frame_data)
176
+ # ctx.execute("SHOW TABLES", eager: true)
177
+ # # =>
178
+ # # shape: (1, 1)
179
+ # # ┌─────────────┐
180
+ # # │ name │
181
+ # # │ --- │
182
+ # # │ str │
183
+ # # ╞═════════════╡
184
+ # # │ hello_world │
185
+ # # └─────────────┘
186
+ #
187
+ # @example Calling the method:
188
+ # ctx.tables
189
+ # # => ["hello_world"]
190
+ def tables
191
+ _ctxt.get_tables.sort
192
+ end
193
+ end
194
+ end
@@ -0,0 +1,75 @@
1
+ module Polars
2
+ # Context manager for enabling and disabling the global string cache.
3
+ class StringCache
4
+ def initialize(&block)
5
+ RbStringCacheHolder.hold(&block)
6
+ end
7
+ end
8
+
9
+ module Functions
10
+ # Enable the global string cache.
11
+ #
12
+ # `Categorical` columns created under the same global string cache have
13
+ # the same underlying physical value when string values are equal. This allows the
14
+ # columns to be concatenated or used in a join operation, for example.
15
+ #
16
+ # @return [nil]
17
+ #
18
+ # @example Construct two Series using the same global string cache.
19
+ # Polars.enable_string_cache
20
+ # s1 = Polars::Series.new("color", ["red", "green", "red"], dtype: Polars::Categorical)
21
+ # s2 = Polars::Series.new("color", ["blue", "red", "green"], dtype: Polars::Categorical)
22
+ # Polars.disable_string_cache
23
+ #
24
+ # @example As both Series are constructed under the same global string cache, they can be concatenated.
25
+ # Polars.concat([s1, s2])
26
+ # # =>
27
+ # # shape: (6,)
28
+ # # Series: 'color' [cat]
29
+ # # [
30
+ # # "red"
31
+ # # "green"
32
+ # # "red"
33
+ # # "blue"
34
+ # # "red"
35
+ # # "green"
36
+ # # ]
37
+ def enable_string_cache
38
+ Plr.enable_string_cache
39
+ end
40
+
41
+ # Disable and clear the global string cache.
42
+ #
43
+ # @return [nil]
44
+ #
45
+ # @example Construct two Series using the same global string cache.
46
+ # Polars.enable_string_cache
47
+ # s1 = Polars::Series.new("color", ["red", "green", "red"], dtype: Polars::Categorical)
48
+ # s2 = Polars::Series.new("color", ["blue", "red", "green"], dtype: Polars::Categorical)
49
+ # Polars.disable_string_cache
50
+ #
51
+ # @example As both Series are constructed under the same global string cache, they can be concatenated.
52
+ # Polars.concat([s1, s2])
53
+ # # =>
54
+ # # shape: (6,)
55
+ # # Series: 'color' [cat]
56
+ # # [
57
+ # # "red"
58
+ # # "green"
59
+ # # "red"
60
+ # # "blue"
61
+ # # "red"
62
+ # # "green"
63
+ # # ]
64
+ def disable_string_cache
65
+ Plr.disable_string_cache
66
+ end
67
+
68
+ # Check whether the global string cache is enabled.
69
+ #
70
+ # @return [Boolean]
71
+ def using_string_cache
72
+ Plr.using_string_cache
73
+ end
74
+ end
75
+ end