polars-df 0.3.1-x86_64-linux → 0.4.0-x86_64-linux
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +9228 -11189
- data/README.md +29 -0
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -2
data/lib/polars/io.rb
CHANGED
@@ -2,7 +2,7 @@ module Polars
   module IO
     # Read a CSV file into a DataFrame.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
@@ -89,7 +89,7 @@ module Polars
    # Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
    # an expensive operation.
    def read_csv(
-
+      source,
      has_header: true,
      columns: nil,
      new_columns: nil,
@@ -137,7 +137,7 @@ module Polars
      end

      df = nil
-      _prepare_file_arg(
+      _prepare_file_arg(source) do |data|
        df = DataFrame._read_csv(
          data,
          has_header: has_header,
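In practice, this hunk renames the first positional argument of `Polars.read_csv` to `source`, which the docs describe as a path or a file-like object. A minimal usage sketch (the CSV path and contents are placeholders, not from the diff):

```ruby
require "polars-df"
require "stringio"

# Read from a file path ("data.csv" is a hypothetical local file).
df = Polars.read_csv("data.csv", has_header: true)

# The docstring says file-like objects are accepted, e.g. a StringIO.
df = Polars.read_csv(StringIO.new("a,b\n1,2\n3,4\n"))
```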
@@ -178,7 +178,7 @@ module Polars
    # projections to the scan level, thereby potentially reducing
    # memory overhead.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
@@ -242,7 +242,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_csv(
-
+      source,
      has_header: true,
      sep: ",",
      comment_char: nil,
@@ -268,12 +268,12 @@ module Polars
      _check_arg_is_1byte("comment_char", comment_char, false)
      _check_arg_is_1byte("quote_char", quote_char, true)

-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_csv(
-
+        source,
        has_header: has_header,
        sep: sep,
        comment_char: comment_char,
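`scan_csv` follows the same renaming, and path-like sources now run through the new `Utils.normalise_filepath` before the lazy scan is built. A sketch of the lazy workflow (placeholder path and column names):

```ruby
require "polars-df"

# Build a lazy query; nothing is read until collect is called.
lf = Polars.scan_csv("data.csv")
df = lf.filter(Polars.col("a") > 1).select(["a", "b"]).collect
```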
@@ -302,7 +302,7 @@ module Polars
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
-    # @param
+    # @param source [String]
    #   Path to a IPC file.
    # @param n_rows [Integer]
    #   Stop reading from IPC file after reading `n_rows`.
@@ -324,7 +324,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_ipc(
-
+      source,
      n_rows: nil,
      cache: true,
      rechunk: true,
@@ -334,7 +334,7 @@ module Polars
      memory_map: true
    )
      LazyFrame._scan_ipc(
-
+        source,
        n_rows: n_rows,
        cache: cache,
        rechunk: rechunk,
@@ -350,7 +350,7 @@ module Polars
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
-    # @param
+    # @param source [String]
    #   Path to a file.
    # @param n_rows [Integer]
    #   Stop reading from parquet file after reading `n_rows`.
@@ -374,7 +374,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_parquet(
-
+      source,
      n_rows: nil,
      cache: true,
      parallel: "auto",
@@ -384,12 +384,12 @@ module Polars
      storage_options: nil,
      low_memory: false
    )
-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_parquet(
-
+        source,
        n_rows: n_rows,
        cache: cache,
        parallel: parallel,
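As with the CSV scan, `scan_parquet` now normalises a path-like `source` up front before handing it to `LazyFrame._scan_parquet`. A sketch (placeholder path and columns); pushdown means only the needed columns and row groups are decoded:

```ruby
require "polars-df"

lf = Polars.scan_parquet("data.parquet")
df = lf.filter(Polars.col("id") > 100).select(["id", "value"]).collect
```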
@@ -406,7 +406,7 @@ module Polars
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
-    # @param
+    # @param source [String]
    #   Path to a file.
    # @param infer_schema_length [Integer]
    #   Infer the schema length from the first `infer_schema_length` rows.
@@ -426,7 +426,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_ndjson(
-
+      source,
      infer_schema_length: 100,
      batch_size: 1024,
      n_rows: nil,
@@ -435,12 +435,12 @@ module Polars
      row_count_name: nil,
      row_count_offset: 0
    )
-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_ndjson(
-
+        source,
        infer_schema_length: infer_schema_length,
        batch_size: batch_size,
        n_rows: n_rows,
@@ -453,7 +453,7 @@ module Polars

    # Read into a DataFrame from Apache Avro format.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -462,17 +462,17 @@ module Polars
    #   Stop reading from Apache Avro file after reading ``n_rows``.
    #
    # @return [DataFrame]
-    def read_avro(
-      if Utils.pathlike?(
-
+    def read_avro(source, columns: nil, n_rows: nil)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

-      DataFrame._read_avro(
+      DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
    end

    # Read into a DataFrame from Arrow IPC (Feather v2) file.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
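The full `read_avro` signature is visible in this hunk, so a call maps directly onto it (the Avro path and column names are placeholders):

```ruby
require "polars-df"

# Signature from the hunk: read_avro(source, columns: nil, n_rows: nil)
df = Polars.read_avro("events.avro", columns: ["id", "ts"], n_rows: 1_000)
```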
@@ -495,7 +495,7 @@ module Polars
    #
    # @return [DataFrame]
    def read_ipc(
-
+      source,
      columns: nil,
      n_rows: nil,
      memory_map: true,
@@ -505,7 +505,7 @@ module Polars
      rechunk: true
    )
      storage_options ||= {}
-      _prepare_file_arg(
+      _prepare_file_arg(source, **storage_options) do |data|
        DataFrame._read_ipc(
          data,
          columns: columns,
|
@@ -520,8 +520,8 @@ module Polars
|
|
520
520
|
|
521
521
|
# Read into a DataFrame from a parquet file.
|
522
522
|
#
|
523
|
-
# @param
|
524
|
-
# Path to a file
|
523
|
+
# @param source [Object]
|
524
|
+
# Path to a file or a file-like object.
|
525
525
|
# @param columns [Object]
|
526
526
|
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
527
527
|
# of column names.
|
@@ -539,6 +539,12 @@ module Polars
    #   Offset to start the row_count column (only use if the name is set).
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
    #
    # @return [DataFrame]
    #
@@ -548,16 +554,18 @@ module Polars
    # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
    # an expensive operation.
    def read_parquet(
-
+      source,
      columns: nil,
      n_rows: nil,
      storage_options: nil,
      parallel: "auto",
      row_count_name: nil,
      row_count_offset: 0,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true,
+      rechunk: true
    )
-      _prepare_file_arg(
+      _prepare_file_arg(source) do |data|
        DataFrame._read_parquet(
          data,
          columns: columns,
@@ -565,49 +573,51 @@ module Polars
          parallel: parallel,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
-          low_memory: low_memory
+          low_memory: low_memory,
+          use_statistics: use_statistics,
+          rechunk: rechunk
        )
      end
    end

    # Read into a DataFrame from a JSON file.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
-    def read_json(
-      DataFrame._read_json(
+    def read_json(source)
+      DataFrame._read_json(source)
    end

    # Read into a DataFrame from a newline delimited JSON file.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
-    def read_ndjson(
-      DataFrame._read_ndjson(
+    def read_ndjson(source)
+      DataFrame._read_ndjson(source)
    end

    # Read a SQL query into a DataFrame.
    #
-    # @param
+    # @param query [Object]
    #   ActiveRecord::Relation or ActiveRecord::Result.
    #
    # @return [DataFrame]
-    def
+    def read_database(query)
      if !defined?(ActiveRecord)
        raise Error, "Active Record not available"
      end

      result =
-        if
-
-        elsif
-
-        elsif
-          ActiveRecord::Base.connection.select_all(
+        if query.is_a?(ActiveRecord::Result)
+          query
+        elsif query.is_a?(ActiveRecord::Relation)
+          query.connection.select_all(query.to_sql)
+        elsif query.is_a?(String)
+          ActiveRecord::Base.connection.select_all(query)
        else
          raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
        end
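The two new keywords thread through `read_parquet` into `DataFrame._read_parquet`. A sketch of how a caller might use them (the path is a placeholder; `rechunk: false` follows the benchmarking advice in the docstring above):

```ruby
require "polars-df"

df = Polars.read_parquet(
  "data.parquet",
  use_statistics: true, # skip pages via parquet statistics (the new default)
  rechunk: false        # skip the final defragmentation pass
)
```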
@@ -617,6 +627,7 @@ module Polars
      end
      DataFrame.new(data)
    end
+    alias_method :read_sql, :read_database

    # def read_excel
    # end
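With the branches filled in and the new alias, `read_database` accepts a raw SQL string, a relation, or an already-executed result. A sketch assuming a configured ActiveRecord connection and a `User` model (both hypothetical here):

```ruby
require "polars-df"
require "active_record"

# Raw SQL string, executed via ActiveRecord::Base.connection.
df = Polars.read_database("SELECT id, name FROM users")

# An ActiveRecord::Relation is converted with to_sql on its own connection.
df = Polars.read_database(User.where(active: true))

# read_sql is now an alias for read_database.
df = Polars.read_sql("SELECT COUNT(*) AS n FROM users")
```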
@@ -628,7 +639,7 @@ module Polars
    #   file chunks. After that work will only be done
    #   if `next_batches` is called.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
@@ -712,7 +723,7 @@ module Polars
    #   )
    #   reader.next_batches(5)
    def read_csv_batched(
-
+      source,
      has_header: true,
      columns: nil,
      new_columns: nil,
@@ -752,7 +763,7 @@ module Polars
      end

      BatchedCsvReader.new(
-
+        source,
        has_header: has_header,
        columns: columns || projection,
        sep: sep,
@@ -781,30 +792,30 @@ module Polars

    # Get a schema of the IPC file without reading data.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
-    def read_ipc_schema(
-      if Utils.pathlike?(
-
+    def read_ipc_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

-      _ipc_schema(
+      _ipc_schema(source)
    end

    # Get a schema of the Parquet file without reading data.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
-    def read_parquet_schema(
-      if Utils.pathlike?(
-
+    def read_parquet_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

-      _parquet_schema(
+      _parquet_schema(source)
    end

    private
data/lib/polars/lazy_frame.rb
CHANGED
@@ -80,7 +80,8 @@ module Polars
      row_count_name: nil,
      row_count_offset: 0,
      storage_options: nil,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true
    )
      _from_rbldf(
        RbLazyFrame.new_from_parquet(
@@ -90,7 +91,8 @@ module Polars
          parallel,
          rechunk,
          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          low_memory
+          low_memory,
+          use_statistics
        )
      )
    end
@@ -107,7 +109,7 @@ module Polars
      memory_map: true
    )
      if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
      end

      _from_rbldf(
@@ -157,7 +159,7 @@ module Polars
    # @return [LazyFrame]
    def self.read_json(file)
      if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
      end

      Utils.wrap_ldf(RbLazyFrame.read_json(file))
@@ -264,7 +266,7 @@ module Polars
    # @return [nil]
    def write_json(file)
      if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
      end
      _ldf.write_json(file)
      nil
@@ -473,6 +475,96 @@ module Polars
      Utils.wrap_df(ldf.collect)
    end

+    # Persists a LazyFrame at the provided path.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    #   Choose "snappy" for more backwards compatibility guarantees
+    #   when you deal with older parquet readers.
+    # @param compression_level [Integer]
+    #   The level of compression to use. Higher compression means smaller files on
+    #   disk.
+    #
+    #   - "gzip" : min-level: 0, max-level: 10.
+    #   - "brotli" : min-level: 0, max-level: 11.
+    #   - "zstd" : min-level: 1, max-level: 22.
+    # @param statistics [Boolean]
+    #   Write statistics to the parquet headers. This requires extra compute.
+    # @param row_group_size [Integer]
+    #   Size of the row groups in number of rows.
+    #   If `nil` (default), the chunks of the `DataFrame` are
+    #   used. Writing in smaller chunks may reduce memory pressure and improve
+    #   writing speeds.
+    # @param data_pagesize_limit [Integer]
+    #   Size limit of individual data pages.
+    #   If not set defaults to 1024 * 1024 bytes
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_parquet("out.parquet")
+    def sink_parquet(
+      path,
+      compression: "zstd",
+      compression_level: nil,
+      statistics: false,
+      row_group_size: nil,
+      data_pagesize_limit: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      no_optimization: false,
+      slice_pushdown: true
+    )
+      if no_optimization
+        predicate_pushdown = false
+        projection_pushdown = false
+        slice_pushdown = false
+      end
+
+      lf = _ldf.optimization_toggle(
+        type_coercion,
+        predicate_pushdown,
+        projection_pushdown,
+        simplify_expression,
+        slice_pushdown,
+        false,
+        true
+      )
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
    # Collect a small number of rows for debugging purposes.
    #
    # Fetch is like a {#collect} operation, but it overwrites the number of rows
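The diff's own `@example` is the core usage; a slightly fuller sketch of the streaming write (paths and the column name are placeholders):

```ruby
require "polars-df"

# Filter a larger-than-RAM CSV and stream the result to parquet
# without ever materializing the full DataFrame.
Polars.scan_csv("big.csv")
      .filter(Polars.col("amount") > 0)
      .sink_parquet("big.parquet", compression: "zstd", statistics: true)
```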
@@ -2192,6 +2284,10 @@ module Polars
    #   Name to give to the `variable` column. Defaults to "variable"
    # @param value_name [String]
    #   Name to give to the `value` column. Defaults to "value"
+    # @param streamable [Boolean]
+    #   Allow this node to run in the streaming engine.
+    #   If this runs in streaming, the output of the melt operation
+    #   will not have a stable ordering.
    #
    # @return [LazyFrame]
    #
@@ -2218,7 +2314,7 @@ module Polars
    # # │ y   ┆ c        ┆ 4     │
    # # │ z   ┆ c        ┆ 6     │
    # # └─────┴──────────┴───────┘
-    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
      if value_vars.is_a?(String)
        value_vars = [value_vars]
      end
@@ -2232,7 +2328,7 @@ module Polars
        id_vars = []
      end
      _from_rbldf(
-        _ldf.melt(id_vars, value_vars, value_name, variable_name)
+        _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
      )
    end

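End to end, the new keyword reaches the native `melt`. A sketch of opting out of streaming when row order matters (data is made up for illustration):

```ruby
require "polars-df"

lf = Polars::DataFrame.new({"a" => ["x", "y"], "b" => [1, 3], "c" => [2, 4]}).lazy

# streamable: true is the default; pass false for a stable output ordering.
df = lf.melt(id_vars: "a", value_vars: ["b", "c"], streamable: false).collect
```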
|
@@ -657,7 +657,7 @@ module Polars
|
|
657
657
|
# Default is ascending.
|
658
658
|
#
|
659
659
|
# @return [Expr]
|
660
|
-
def
|
660
|
+
def arg_sort_by(exprs, reverse: false)
|
661
661
|
if !exprs.is_a?(Array)
|
662
662
|
exprs = [exprs]
|
663
663
|
end
|
@@ -665,8 +665,9 @@ module Polars
        reverse = [reverse] * exprs.length
      end
      exprs = Utils.selection_to_rbexpr_list(exprs)
-      Utils.wrap_expr(RbExpr.
+      Utils.wrap_expr(RbExpr.arg_sort_by(exprs, reverse))
    end
+    alias_method :argsort_by, :arg_sort_by

    # Create polars `Duration` from distinct time components.
    #
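So `arg_sort_by` becomes the primary name, with `argsort_by` kept as an alias. A sketch (data is made up for illustration):

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 3, 2], "b" => [9, 7, 8]})

# Row indices that would sort the frame by "a" descending.
df.select(Polars.arg_sort_by("a", reverse: true))
```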
data/lib/polars/list_expr.rb
CHANGED
@@ -426,7 +426,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 1]
    # #   [null, -8, -1]
    # # ]
    def diff(n: 1, null_behavior: "ignore")
@@ -447,7 +447,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 3]
    # #   [null, 10, 2]
    # # ]
    def shift(periods = 1)
data/lib/polars/list_name_space.rb
CHANGED
@@ -185,7 +185,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 1]
    # #   [null, -8, -1]
    # # ]
    def diff(n: 1, null_behavior: "ignore")
@@ -206,7 +206,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 3]
    # #   [null, 10, 2]
    # # ]
    def shift(periods = 1)
data/lib/polars/series.rb
CHANGED
@@ -3531,6 +3531,13 @@ module Polars
      ListNameSpace.new(self)
    end

+    # Create an object namespace of all binary related methods.
+    #
+    # @return [BinaryNameSpace]
+    def bin
+      BinaryNameSpace.new(self)
+    end
+
    # Create an object namespace of all categorical related methods.
    #
    # @return [CatNameSpace]
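The namespace itself lives in the new `binary_name_space.rb` (+66 lines, not expanded on this page), so its method list is not shown here. Assuming it mirrors the Python Polars `.bin` namespace, a call might look like the following; `contains` is an assumption, not confirmed by this diff:

```ruby
require "polars-df"

s = Polars::Series.new("data", ["\x00\x01".b, "\xff".b])

# Hypothetical: byte-substring check via the new binary namespace.
s.bin.contains("\x01".b)
```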
@@ -3795,7 +3802,8 @@ module Polars
      UInt32 => RbSeries.method(:new_opt_u32),
      UInt64 => RbSeries.method(:new_opt_u64),
      Boolean => RbSeries.method(:new_opt_bool),
-      Utf8 => RbSeries.method(:new_str)
+      Utf8 => RbSeries.method(:new_str),
+      Binary => RbSeries.method(:new_binary)
    }

    SYM_TYPE_TO_CONSTRUCTOR = {
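With `Binary => RbSeries.method(:new_binary)` registered, a series of raw bytes can be built when the dtype is given explicitly. A sketch (assumes the `Binary` dtype constant introduced in this release):

```ruby
require "polars-df"

# Build a Series of raw byte strings.
s = Polars::Series.new("bytes", ["\x01\x02".b, "\x03".b], dtype: Polars::Binary)
```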
data/lib/polars/string_expr.rb
CHANGED
data/lib/polars/utils.rb
CHANGED
@@ -93,8 +93,12 @@ module Polars
      Polars.lit(value)
    end

-    def self.
-      File.expand_path(path)
+    def self.normalise_filepath(path, check_not_directory: true)
+      path = File.expand_path(path)
+      if check_not_directory && File.exist?(path) && Dir.exist?(path)
+        raise ArgumentError, "Expected a file path; #{path} is a directory"
+      end
+      path
    end

    # TODO fix
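The renamed helper now rejects directories up front instead of letting a reader fail later. A behaviour sketch (paths are placeholders; the second call assumes `/tmp` exists as a directory):

```ruby
require "polars-df"

Polars::Utils.normalise_filepath("~/data.csv")
# => expanded absolute path, e.g. "/home/user/data.csv"

Polars::Utils.normalise_filepath("/tmp")
# => raises ArgumentError: Expected a file path; /tmp is a directory
```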
@@ -216,5 +220,9 @@ module Polars
        val.is_a?(Array) && _is_iterable_of(val, String)
      end
    end
+
+    def self.local_file?(file)
+      Dir.glob(file).any?
+    end
  end
end
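`local_file?` is a one-liner over `Dir.glob`, so it treats its argument as a glob pattern and returns true only when something matches on the local filesystem:

```ruby
require "polars-df"

Polars::Utils.local_file?("data/*.parquet")          # => true if any local match
Polars::Utils.local_file?("s3://bucket/key.parquet") # => false (no local match)
```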
data/lib/polars/version.rb
CHANGED
data/lib/polars.rb
CHANGED
@@ -12,6 +12,8 @@ require "stringio"
|
|
12
12
|
# modules
|
13
13
|
require_relative "polars/expr_dispatch"
|
14
14
|
require_relative "polars/batched_csv_reader"
|
15
|
+
require_relative "polars/binary_expr"
|
16
|
+
require_relative "polars/binary_name_space"
|
15
17
|
require_relative "polars/cat_expr"
|
16
18
|
require_relative "polars/cat_name_space"
|
17
19
|
require_relative "polars/convert"
|