polars-df 0.10.0-aarch64-linux → 0.11.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
+ module Polars
+   module IO
+     # Read a SQL query into a DataFrame.
+     #
+     # @param query [Object]
+     #   ActiveRecord::Relation, ActiveRecord::Result, or SQL query string.
+     # @param schema_overrides [Hash]
+     #   A hash mapping column names to dtypes, used to override the schema
+     #   inferred from the query.
+     #
+     # @return [DataFrame]
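+     #
+     # @example A minimal sketch of reading an ActiveRecord relation (the `User` model and its columns are illustrative, not part of this gem):
+     #   df = Polars.read_database(User.where(active: true))
+     #
+     # @example Reading a raw SQL string (assumes an ActiveRecord connection is already established):
+     #   df = Polars.read_database("SELECT id, name FROM users")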
+     def read_database(query, schema_overrides: nil)
+       if !defined?(ActiveRecord)
+         raise Error, "Active Record not available"
+       end
+
+       result =
+         if query.is_a?(ActiveRecord::Result)
+           query
+         elsif query.is_a?(ActiveRecord::Relation)
+           query.connection.select_all(query.to_sql)
+         elsif query.is_a?(::String)
+           ActiveRecord::Base.connection.select_all(query)
+         else
+           raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+         end
+
+       data = {}
+       schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
+
+       result.columns.each_with_index do |k, i|
+         column_type = result.column_types[i]
+
+         data[k] =
+           if column_type
+             result.rows.map { |r| column_type.deserialize(r[i]) }
+           else
+             result.rows.map { |r| r[i] }
+           end
+
+         polars_type =
+           case column_type&.type
+           when :binary
+             Binary
+           when :boolean
+             Boolean
+           when :date
+             Date
+           when :datetime, :timestamp
+             Datetime
+           when :decimal
+             Decimal
+           when :float
+             Float64
+           when :integer
+             Int64
+           when :string, :text
+             String
+           when :time
+             Time
+           # TODO fix issue with null
+           # when :json, :jsonb
+           #   Struct
+           end
+
+         schema_overrides[k] ||= polars_type if polars_type
+       end
+
+       DataFrame.new(data, schema_overrides: schema_overrides)
+     end
+     alias_method :read_sql, :read_database
+   end
+ end
@@ -0,0 +1,247 @@
+ module Polars
+   module IO
+     # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     # @param columns [Object]
+     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+     #   of column names.
+     # @param n_rows [Integer]
+     #   Stop reading from the IPC file after reading `n_rows`.
+     # @param memory_map [Boolean]
+     #   Try to memory map the file. This can greatly improve performance on repeated
+     #   queries as the OS may cache pages.
+     #   Only uncompressed IPC files can be memory mapped.
+     # @param storage_options [Hash]
+     #   Extra options that make sense for a particular storage connection.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into the
+     #   DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param rechunk [Boolean]
+     #   Make sure that all data is contiguous.
+     #
+     # @return [DataFrame]
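+     #
+     # @example A minimal sketch (the file path and column names are illustrative):
+     #   df = Polars.read_ipc("data.arrow", columns: ["a", "b"], n_rows: 1_000)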
+     def read_ipc(
+       source,
+       columns: nil,
+       n_rows: nil,
+       memory_map: true,
+       storage_options: nil,
+       row_count_name: nil,
+       row_count_offset: 0,
+       rechunk: true
+     )
+       storage_options ||= {}
+       _prepare_file_arg(source, **storage_options) do |data|
+         _read_ipc_impl(
+           data,
+           columns: columns,
+           n_rows: n_rows,
+           row_count_name: row_count_name,
+           row_count_offset: row_count_offset,
+           rechunk: rechunk,
+           memory_map: memory_map
+         )
+       end
+     end
+
+     # @private
+     def _read_ipc_impl(
+       file,
+       columns: nil,
+       n_rows: nil,
+       row_count_name: nil,
+       row_count_offset: 0,
+       rechunk: true,
+       memory_map: true
+     )
+       if Utils.pathlike?(file)
+         file = Utils.normalize_filepath(file)
+       end
+       if columns.is_a?(::String)
+         columns = [columns]
+       end
+
+       if file.is_a?(::String) && file.include?("*")
+         raise Todo
+       end
+
+       projection, columns = Utils.handle_projection_columns(columns)
+       rbdf =
+         RbDataFrame.read_ipc(
+           file,
+           columns,
+           projection,
+           n_rows,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset),
+           memory_map
+         )
+       Utils.wrap_df(rbdf)
+     end
+
+     # Read into a DataFrame from an Arrow IPC record batch stream.
+     #
+     # See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     # @param columns [Array]
+     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+     #   of column names.
+     # @param n_rows [Integer]
+     #   Stop reading from the IPC stream after reading `n_rows`.
+     # @param storage_options [Hash]
+     #   Extra options that make sense for a particular storage connection.
+     # @param row_index_name [String]
+     #   Insert a row index column with the given name into the DataFrame as the first
+     #   column. If set to `nil` (default), no row index column is created.
+     # @param row_index_offset [Integer]
+     #   Start the row index at this offset. Cannot be negative.
+     #   Only used if `row_index_name` is set.
+     # @param rechunk [Boolean]
+     #   Make sure that all data is contiguous.
+     #
+     # @return [DataFrame]
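+     #
+     # @example A minimal sketch (the file path is illustrative):
+     #   df = Polars.read_ipc_stream("data.arrows", row_index_name: "idx")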
+     def read_ipc_stream(
+       source,
+       columns: nil,
+       n_rows: nil,
+       storage_options: nil,
+       row_index_name: nil,
+       row_index_offset: 0,
+       rechunk: true
+     )
+       storage_options ||= {}
+       _prepare_file_arg(source, **storage_options) do |data|
+         _read_ipc_stream_impl(
+           data,
+           columns: columns,
+           n_rows: n_rows,
+           row_index_name: row_index_name,
+           row_index_offset: row_index_offset,
+           rechunk: rechunk
+         )
+       end
+     end
+
+     # @private
+     def _read_ipc_stream_impl(
+       source,
+       columns: nil,
+       n_rows: nil,
+       row_index_name: nil,
+       row_index_offset: 0,
+       rechunk: true
+     )
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+       if columns.is_a?(::String)
+         columns = [columns]
+       end
+
+       projection, columns = Utils.handle_projection_columns(columns)
+       rbdf = RbDataFrame.read_ipc_stream(
+         source,
+         columns,
+         projection,
+         n_rows,
+         Utils._prepare_row_count_args(row_index_name, row_index_offset),
+         rechunk
+       )
+       Utils.wrap_df(rbdf)
+     end
+
+     # Get the schema of an IPC file without reading data.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     #
+     # @return [Hash]
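+     #
+     # @example A minimal sketch (the file path is illustrative); returns a hash mapping column names to dtypes:
+     #   Polars.read_ipc_schema("data.arrow")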
+     def read_ipc_schema(source)
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+
+       Plr.ipc_schema(source)
+     end
+
+     # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+     #
+     # This allows the query optimizer to push down predicates and projections to the scan
+     # level, thereby potentially reducing memory overhead.
+     #
+     # @param source [String]
+     #   Path to an IPC file.
+     # @param n_rows [Integer]
+     #   Stop reading from the IPC file after reading `n_rows`.
+     # @param cache [Boolean]
+     #   Cache the result after reading.
+     # @param rechunk [Boolean]
+     #   Reallocate to contiguous memory when all chunks/files are parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into the
+     #   DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param storage_options [Hash]
+     #   Extra options that make sense for a particular storage connection.
+     # @param memory_map [Boolean]
+     #   Try to memory map the file. This can greatly improve performance on repeated
+     #   queries as the OS may cache pages.
+     #   Only uncompressed IPC files can be memory mapped.
+     #
+     # @return [LazyFrame]
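+     #
+     # @example A minimal sketch of a lazy scan followed by a filtered collect (path and column names are illustrative):
+     #   Polars.scan_ipc("data.arrow")
+     #     .filter(Polars.col("a") > 1)
+     #     .select(["a", "b"])
+     #     .collect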
+     def scan_ipc(
+       source,
+       n_rows: nil,
+       cache: true,
+       rechunk: true,
+       row_count_name: nil,
+       row_count_offset: 0,
+       storage_options: nil,
+       memory_map: true
+     )
+       _scan_ipc_impl(
+         source,
+         n_rows: n_rows,
+         cache: cache,
+         rechunk: rechunk,
+         row_count_name: row_count_name,
+         row_count_offset: row_count_offset,
+         storage_options: storage_options,
+         memory_map: memory_map
+       )
+     end
+
+     # @private
+     def _scan_ipc_impl(
+       file,
+       n_rows: nil,
+       cache: true,
+       rechunk: true,
+       row_count_name: nil,
+       row_count_offset: 0,
+       storage_options: nil,
+       memory_map: true
+     )
+       if Utils.pathlike?(file)
+         file = Utils.normalize_filepath(file)
+       end
+
+       rblf =
+         RbLazyFrame.new_from_ipc(
+           file,
+           n_rows,
+           cache,
+           rechunk,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset),
+           memory_map
+         )
+       Utils.wrap_ldf(rblf)
+     end
+   end
+ end
@@ -0,0 +1,18 @@
+ module Polars
+   module IO
+     # Read into a DataFrame from a JSON file.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     #
+     # @return [DataFrame]
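+     #
+     # @example A minimal sketch (the file path is illustrative):
+     #   df = Polars.read_json("data.json")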
+     def read_json(source)
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+
+       rbdf = RbDataFrame.read_json(source)
+       Utils.wrap_df(rbdf)
+     end
+   end
+ end
@@ -0,0 +1,69 @@
+ module Polars
+   module IO
+     # Read into a DataFrame from a newline delimited JSON file.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     #
+     # @return [DataFrame]
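+     #
+     # @example A minimal sketch (the file path is illustrative):
+     #   df = Polars.read_ndjson("data.ndjson")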
+     def read_ndjson(source)
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+
+       rbdf = RbDataFrame.read_ndjson(source)
+       Utils.wrap_df(rbdf)
+     end
+
+     # Lazily read from a newline delimited JSON file.
+     #
+     # This allows the query optimizer to push down predicates and projections to the scan
+     # level, thereby potentially reducing memory overhead.
+     #
+     # @param source [String]
+     #   Path to a file.
+     # @param infer_schema_length [Integer]
+     #   Infer the schema from the first `infer_schema_length` rows.
+     # @param batch_size [Integer]
+     #   Number of rows to read in each batch.
+     # @param n_rows [Integer]
+     #   Stop reading from the JSON file after reading `n_rows`.
+     # @param low_memory [Boolean]
+     #   Reduce memory pressure at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Reallocate to contiguous memory when all chunks/files are parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into the
+     #   DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     #
+     # @return [LazyFrame]
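+     #
+     # @example A minimal sketch of a lazy scan followed by a collect (path and column name are illustrative):
+     #   Polars.scan_ndjson("data.ndjson").filter(Polars.col("a") > 1).collect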
+     def scan_ndjson(
+       source,
+       infer_schema_length: 100,
+       batch_size: 1024,
+       n_rows: nil,
+       low_memory: false,
+       rechunk: true,
+       row_count_name: nil,
+       row_count_offset: 0
+     )
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+
+       rblf =
+         RbLazyFrame.new_from_ndjson(
+           source,
+           infer_schema_length,
+           batch_size,
+           n_rows,
+           low_memory,
+           rechunk,
+           Utils._prepare_row_count_args(row_count_name, row_count_offset)
+         )
+       Utils.wrap_ldf(rblf)
+     end
+   end
+ end
@@ -0,0 +1,226 @@
+ module Polars
+   module IO
+     # Read into a DataFrame from a Parquet file.
+     #
+     # @param source [String, Pathname, StringIO]
+     #   Path to a file or a file-like object.
+     # @param columns [Object]
+     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+     #   of column names.
+     # @param n_rows [Integer]
+     #   Stop reading from the Parquet file after reading `n_rows`.
+     # @param storage_options [Hash]
+     #   Extra options that make sense for a particular storage connection.
+     # @param parallel ["auto", "columns", "row_groups", "none"]
+     #   This determines the direction of parallelism. 'auto' will try to determine the
+     #   optimal direction.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into the
+     #   DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param low_memory [Boolean]
+     #   Reduce memory pressure at the expense of performance.
+     # @param use_statistics [Boolean]
+     #   Use statistics in the Parquet metadata to determine if pages
+     #   can be skipped from reading.
+     # @param rechunk [Boolean]
+     #   Make sure that all columns are contiguous in memory by
+     #   aggregating the chunks into a single array.
+     #
+     # @return [DataFrame]
+     #
+     # @note
+     #   This operation defaults to a `rechunk` operation at the end, meaning that
+     #   all data will be stored contiguously in memory.
+     #   Set `rechunk: false` if you are benchmarking the parquet reader. A `rechunk` is
+     #   an expensive operation.
+ def read_parquet(
39
+ source,
40
+ columns: nil,
41
+ n_rows: nil,
42
+ storage_options: nil,
43
+ parallel: "auto",
44
+ row_count_name: nil,
45
+ row_count_offset: 0,
46
+ low_memory: false,
47
+ use_statistics: true,
48
+ rechunk: true
49
+ )
50
+ _prepare_file_arg(source) do |data|
51
+ _read_parquet_impl(
52
+ data,
53
+ columns: columns,
54
+ n_rows: n_rows,
55
+ parallel: parallel,
56
+ row_count_name: row_count_name,
57
+ row_count_offset: row_count_offset,
58
+ low_memory: low_memory,
59
+ use_statistics: use_statistics,
60
+ rechunk: rechunk
61
+ )
62
+ end
63
+ end
64
+
65
+ # @private
66
+ def _read_parquet_impl(
67
+ source,
68
+ columns: nil,
69
+ n_rows: nil,
70
+ parallel: "auto",
71
+ row_count_name: nil,
72
+ row_count_offset: 0,
73
+ low_memory: false,
74
+ use_statistics: true,
75
+ rechunk: true
76
+ )
77
+ if Utils.pathlike?(source)
78
+ source = Utils.normalize_filepath(source)
79
+ end
80
+ if columns.is_a?(::String)
81
+ columns = [columns]
82
+ end
83
+
84
+ if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
85
+ scan =
86
+ scan_parquet(
87
+ source,
88
+ n_rows: n_rows,
89
+ rechunk: true,
90
+ parallel: parallel,
91
+ row_count_name: row_count_name,
92
+ row_count_offset: row_count_offset,
93
+ low_memory: low_memory
94
+ )
95
+
96
+ if columns.nil?
97
+ return scan.collect
98
+ elsif Utils.is_str_sequence(columns, allow_str: false)
99
+ return scan.select(columns).collect
100
+ else
101
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
102
+ end
103
+ end
104
+
105
+ projection, columns = Utils.handle_projection_columns(columns)
106
+ rbdf =
107
+ RbDataFrame.read_parquet(
108
+ source,
109
+ columns,
110
+ projection,
111
+ n_rows,
112
+ parallel,
113
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
114
+ low_memory,
115
+ use_statistics,
116
+ rechunk
117
+ )
118
+ Utils.wrap_df(rbdf)
119
+ end
120
+
121
+ # Get a schema of the Parquet file without reading data.
122
+ #
123
+ # @param source [Object]
124
+ # Path to a file or a file-like object.
125
+ #
126
+ # @return [Hash]
127
+ def read_parquet_schema(source)
128
+ if Utils.pathlike?(source)
129
+ source = Utils.normalize_filepath(source)
130
+ end
131
+
132
+ Plr.parquet_schema(source)
133
+ end
134
+
135
+ # Lazily read from a parquet file or multiple files via glob patterns.
136
+ #
137
+ # This allows the query optimizer to push down predicates and projections to the scan
138
+ # level, thereby potentially reducing memory overhead.
139
+ #
140
+ # @param source [String]
141
+ # Path to a file.
142
+ # @param n_rows [Integer]
143
+ # Stop reading from parquet file after reading `n_rows`.
144
+ # @param cache [Boolean]
145
+ # Cache the result after reading.
146
+ # @param parallel ["auto", "columns", "row_groups", "none"]
147
+ # This determines the direction of parallelism. 'auto' will try to determine the
148
+ # optimal direction.
149
+ # @param rechunk [Boolean]
150
+ # In case of reading multiple files via a glob pattern rechunk the final DataFrame
151
+ # into contiguous memory chunks.
152
+ # @param row_count_name [String]
153
+ # If not nil, this will insert a row count column with give name into the
154
+ # DataFrame.
155
+ # @param row_count_offset [Integer]
156
+ # Offset to start the row_count column (only use if the name is set).
157
+ # @param storage_options [Hash]
158
+ # Extra options that make sense for a particular storage connection.
159
+ # @param low_memory [Boolean]
160
+ # Reduce memory pressure at the expense of performance.
161
+ #
162
+ # @return [LazyFrame]
163
+ def scan_parquet(
164
+ source,
165
+ n_rows: nil,
166
+ cache: true,
167
+ parallel: "auto",
168
+ glob: true,
169
+ rechunk: true,
170
+ row_count_name: nil,
171
+ row_count_offset: 0,
172
+ storage_options: nil,
173
+ low_memory: false
174
+ )
175
+ if Utils.pathlike?(source)
176
+ source = Utils.normalize_filepath(source)
177
+ end
178
+
179
+ _scan_parquet_impl(
180
+ source,
181
+ n_rows:n_rows,
182
+ cache: cache,
183
+ parallel: parallel,
184
+ rechunk: rechunk,
185
+ row_count_name: row_count_name,
186
+ row_count_offset: row_count_offset,
187
+ storage_options: storage_options,
188
+ low_memory: low_memory,
189
+ glob: glob
190
+ )
191
+ end
192
+
193
+ # @private
194
+ def _scan_parquet_impl(
195
+ file,
196
+ n_rows: nil,
197
+ cache: true,
198
+ parallel: "auto",
199
+ rechunk: true,
200
+ row_count_name: nil,
201
+ row_count_offset: 0,
202
+ storage_options: nil,
203
+ low_memory: false,
204
+ use_statistics: true,
205
+ hive_partitioning: true,
206
+ glob: true
207
+ )
208
+ rblf =
209
+ RbLazyFrame.new_from_parquet(
210
+ file,
211
+ [],
212
+ n_rows,
213
+ cache,
214
+ parallel,
215
+ rechunk,
216
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
217
+ low_memory,
218
+ use_statistics,
219
+ hive_partitioning,
220
+ nil,
221
+ glob
222
+ )
223
+ Utils.wrap_ldf(rblf)
224
+ end
225
+ end
226
+ end