polars-df 0.3.1-arm64-darwin → 0.4.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +5797 -7758
- data/README.md +29 -0
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -2
data/lib/polars/io.rb
CHANGED
@@ -2,7 +2,7 @@ module Polars
   module IO
     # Read a CSV file into a DataFrame.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
@@ -89,7 +89,7 @@ module Polars
     #   Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
     #   an expensive operation.
     def read_csv(
-
+      source,
       has_header: true,
       columns: nil,
       new_columns: nil,
@@ -137,7 +137,7 @@ module Polars
       end

       df = nil
-      _prepare_file_arg(
+      _prepare_file_arg(source) do |data|
         df = DataFrame._read_csv(
           data,
           has_header: has_header,
@@ -178,7 +178,7 @@ module Polars
     #   projections to the scan level, thereby potentially reducing
     #   memory overhead.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
@@ -242,7 +242,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_csv(
-
+      source,
       has_header: true,
       sep: ",",
       comment_char: nil,
@@ -268,12 +268,12 @@ module Polars
       _check_arg_is_1byte("comment_char", comment_char, false)
       _check_arg_is_1byte("quote_char", quote_char, true)

-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

       LazyFrame._scan_csv(
-
+        source,
         has_header: has_header,
         sep: sep,
         comment_char: comment_char,
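Taken together, the hunks above rename the first positional parameter of the eager reader and lazy scanner to `source` and route path arguments through the new `Utils.normalise_filepath`. A minimal usage sketch against the 0.4.0 API (the file name is hypothetical):

# Eager: load the whole CSV into a DataFrame.
df = Polars.read_csv("data.csv", has_header: true)

# Lazy: build a scan so filters are pushed down to the reader.
lf = Polars.scan_csv("data.csv")
df = lf.filter(Polars.col("a") > 1).collect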
@@ -302,7 +302,7 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param
+    # @param source [String]
     #   Path to a IPC file.
     # @param n_rows [Integer]
     #   Stop reading from IPC file after reading `n_rows`.
@@ -324,7 +324,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_ipc(
-
+      source,
       n_rows: nil,
       cache: true,
       rechunk: true,
@@ -334,7 +334,7 @@ module Polars
       memory_map: true
     )
       LazyFrame._scan_ipc(
-
+        source,
         n_rows: n_rows,
         cache: cache,
         rechunk: rechunk,
@@ -350,7 +350,7 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param
+    # @param source [String]
     #   Path to a file.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
@@ -374,7 +374,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_parquet(
-
+      source,
       n_rows: nil,
       cache: true,
       parallel: "auto",
@@ -384,12 +384,12 @@ module Polars
       storage_options: nil,
       low_memory: false
     )
-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

       LazyFrame._scan_parquet(
-
+        source,
         n_rows: n_rows,
         cache: cache,
         parallel: parallel,
@@ -406,7 +406,7 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param
+    # @param source [String]
     #   Path to a file.
     # @param infer_schema_length [Integer]
     #   Infer the schema length from the first `infer_schema_length` rows.
@@ -426,7 +426,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_ndjson(
-
+      source,
       infer_schema_length: 100,
       batch_size: 1024,
       n_rows: nil,
@@ -435,12 +435,12 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0
     )
-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

       LazyFrame._scan_ndjson(
-
+        source,
         infer_schema_length: infer_schema_length,
         batch_size: batch_size,
         n_rows: n_rows,
@@ -453,7 +453,7 @@ module Polars

     # Read into a DataFrame from Apache Avro format.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -462,17 +462,17 @@ module Polars
     #   Stop reading from Apache Avro file after reading ``n_rows``.
     #
     # @return [DataFrame]
-    def read_avro(
-      if Utils.pathlike?(
-
+    def read_avro(source, columns: nil, n_rows: nil)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

-      DataFrame._read_avro(
+      DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
     end

     # Read into a DataFrame from Arrow IPC (Feather v2) file.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -495,7 +495,7 @@ module Polars
     #
     # @return [DataFrame]
     def read_ipc(
-
+      source,
       columns: nil,
       n_rows: nil,
       memory_map: true,
@@ -505,7 +505,7 @@ module Polars
       rechunk: true
     )
       storage_options ||= {}
-      _prepare_file_arg(
+      _prepare_file_arg(source, **storage_options) do |data|
         DataFrame._read_ipc(
           data,
           columns: columns,
@@ -520,8 +520,8 @@ module Polars

     # Read into a DataFrame from a parquet file.
     #
-    # @param
-    #   Path to a file
+    # @param source [Object]
+    #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
     #   of column names.
@@ -539,6 +539,12 @@ module Polars
     #   Offset to start the row_count column (only use if the name is set).
     # @param low_memory [Boolean]
     #   Reduce memory pressure at the expense of performance.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
     #
     # @return [DataFrame]
     #
@@ -548,16 +554,18 @@ module Polars
     #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
     #   an expensive operation.
     def read_parquet(
-
+      source,
       columns: nil,
       n_rows: nil,
       storage_options: nil,
       parallel: "auto",
       row_count_name: nil,
       row_count_offset: 0,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true,
+      rechunk: true
     )
-      _prepare_file_arg(
+      _prepare_file_arg(source) do |data|
         DataFrame._read_parquet(
           data,
           columns: columns,
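The eager parquet reader gains `use_statistics` and `rechunk` keywords, threaded through to the native reader in the next hunk. A hedged sketch (the file name is hypothetical):

# Use parquet statistics to skip pages (the default), and skip the
# final rechunk, which the doc comment above flags as expensive.
df = Polars.read_parquet("data.parquet", use_statistics: true, rechunk: false)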
@@ -565,49 +573,51 @@ module Polars
           parallel: parallel,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-          low_memory: low_memory
+          low_memory: low_memory,
+          use_statistics: use_statistics,
+          rechunk: rechunk
         )
       end
     end

     # Read into a DataFrame from a JSON file.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [DataFrame]
-    def read_json(
-      DataFrame._read_json(
+    def read_json(source)
+      DataFrame._read_json(source)
     end

     # Read into a DataFrame from a newline delimited JSON file.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [DataFrame]
-    def read_ndjson(
-      DataFrame._read_ndjson(
+    def read_ndjson(source)
+      DataFrame._read_ndjson(source)
     end

     # Read a SQL query into a DataFrame.
     #
-    # @param
+    # @param query [Object]
     #   ActiveRecord::Relation or ActiveRecord::Result.
     #
     # @return [DataFrame]
-    def
+    def read_database(query)
       if !defined?(ActiveRecord)
         raise Error, "Active Record not available"
       end

       result =
-        if
-
-        elsif
-
-        elsif
-          ActiveRecord::Base.connection.select_all(
+        if query.is_a?(ActiveRecord::Result)
+          query
+        elsif query.is_a?(ActiveRecord::Relation)
+          query.connection.select_all(query.to_sql)
+        elsif query.is_a?(String)
+          ActiveRecord::Base.connection.select_all(query)
         else
           raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
         end
@@ -617,6 +627,7 @@ module Polars
       end
       DataFrame.new(data)
     end
+    alias_method :read_sql, :read_database

     # def read_excel
     # end
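`read_database` now accepts an ActiveRecord::Result, an ActiveRecord::Relation, or a raw SQL string, with `read_sql` kept as an alias. A hedged sketch; `User` is a hypothetical Active Record model and Active Record must be loaded:

df = Polars.read_database("SELECT * FROM users")    # raw SQL string
df = Polars.read_database(User.where(active: true)) # relation, run via to_sql
df = Polars.read_sql("SELECT * FROM users")         # alias of read_database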
@@ -628,7 +639,7 @@ module Polars
     #   file chunks. After that work will only be done
     #   if `next_batches` is called.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
@@ -712,7 +723,7 @@ module Polars
     #   )
     #   reader.next_batches(5)
     def read_csv_batched(
-
+      source,
       has_header: true,
       columns: nil,
       new_columns: nil,
@@ -752,7 +763,7 @@ module Polars
       end

       BatchedCsvReader.new(
-
+        source,
         has_header: has_header,
         columns: columns || projection,
         sep: sep,
@@ -781,30 +792,30 @@ module Polars

     # Get a schema of the IPC file without reading data.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [Hash]
-    def read_ipc_schema(
-      if Utils.pathlike?(
-
+    def read_ipc_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

-      _ipc_schema(
+      _ipc_schema(source)
     end

     # Get a schema of the Parquet file without reading data.
     #
-    # @param
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [Hash]
-    def read_parquet_schema(
-      if Utils.pathlike?(
-
+    def read_parquet_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

-      _parquet_schema(
+      _parquet_schema(source)
     end

     private
data/lib/polars/lazy_frame.rb
CHANGED
@@ -80,7 +80,8 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true
     )
       _from_rbldf(
         RbLazyFrame.new_from_parquet(
@@ -90,7 +91,8 @@ module Polars
           parallel,
           rechunk,
           Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          low_memory
+          low_memory,
+          use_statistics
         )
       )
     end
@@ -107,7 +109,7 @@ module Polars
       memory_map: true
     )
       if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
       end

       _from_rbldf(
@@ -157,7 +159,7 @@ module Polars
     # @return [LazyFrame]
     def self.read_json(file)
       if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
       end

       Utils.wrap_ldf(RbLazyFrame.read_json(file))
@@ -264,7 +266,7 @@ module Polars
     # @return [nil]
     def write_json(file)
       if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
       end
       _ldf.write_json(file)
       nil
@@ -473,6 +475,96 @@ module Polars
       Utils.wrap_df(ldf.collect)
     end

+    # Persists a LazyFrame at the provided path.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    #   Choose "snappy" for more backwards compatibility guarantees
+    #   when you deal with older parquet readers.
+    # @param compression_level [Integer]
+    #   The level of compression to use. Higher compression means smaller files on
+    #   disk.
+    #
+    #   - "gzip" : min-level: 0, max-level: 10.
+    #   - "brotli" : min-level: 0, max-level: 11.
+    #   - "zstd" : min-level: 1, max-level: 22.
+    # @param statistics [Boolean]
+    #   Write statistics to the parquet headers. This requires extra compute.
+    # @param row_group_size [Integer]
+    #   Size of the row groups in number of rows.
+    #   If `nil` (default), the chunks of the `DataFrame` are
+    #   used. Writing in smaller chunks may reduce memory pressure and improve
+    #   writing speeds.
+    # @param data_pagesize_limit [Integer]
+    #   Size limit of individual data pages.
+    #   If not set defaults to 1024 * 1024 bytes
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_parquet("out.parquet")
+    def sink_parquet(
+      path,
+      compression: "zstd",
+      compression_level: nil,
+      statistics: false,
+      row_group_size: nil,
+      data_pagesize_limit: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      no_optimization: false,
+      slice_pushdown: true
+    )
+      if no_optimization
+        predicate_pushdown = false
+        projection_pushdown = false
+        slice_pushdown = false
+      end
+
+      lf = _ldf.optimization_toggle(
+        type_coercion,
+        predicate_pushdown,
+        projection_pushdown,
+        simplify_expression,
+        slice_pushdown,
+        false,
+        true
+      )
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
     # Collect a small number of rows for debugging purposes.
     #
     # Fetch is like a {#collect} operation, but it overwrites the number of rows
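The new `sink_parquet` executes the query on the streaming engine and writes row groups to disk as they are produced, so the result never has to fit in RAM. A minimal sketch (paths are hypothetical):

lf = Polars.scan_csv("my_larger_than_ram_file.csv")

# Pushdown optimizations still run before the sink unless
# no_optimization: true is passed.
lf.filter(Polars.col("a") > 1).sink_parquet("out.parquet", compression: "zstd")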
@@ -2192,6 +2284,10 @@ module Polars
     #   Name to give to the `value` column. Defaults to "variable"
     # @param value_name [String]
     #   Name to give to the `value` column. Defaults to "value"
+    # @param streamable [Boolean]
+    #   Allow this node to run in the streaming engine.
+    #   If this runs in streaming, the output of the melt operation
+    #   will not have a stable ordering.
     #
     # @return [LazyFrame]
     #
@@ -2218,7 +2314,7 @@ module Polars
     #   # │ y   ┆ c        ┆ 4     │
     #   # │ z   ┆ c        ┆ 6     │
     #   # └─────┴──────────┴───────┘
-    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
       if value_vars.is_a?(String)
         value_vars = [value_vars]
       end
@@ -2232,7 +2328,7 @@ module Polars
         id_vars = []
       end
       _from_rbldf(
-        _ldf.melt(id_vars, value_vars, value_name, variable_name)
+        _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
       )
     end

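`melt` picks up a `streamable:` flag (default `true`); passing `false` trades the streaming engine for a stable output row order. A short sketch:

df = Polars::DataFrame.new({"a" => ["x", "y", "z"], "b" => [1, 3, 5], "c" => [2, 4, 6]})

# Opt out of streaming when the order of the molten rows matters.
out = df.lazy.melt(id_vars: "a", value_vars: ["b", "c"], streamable: false).collect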
data/lib/polars/lazy_functions.rb
CHANGED
@@ -657,7 +657,7 @@ module Polars
     #   Default is ascending.
     #
     # @return [Expr]
-    def
+    def arg_sort_by(exprs, reverse: false)
       if !exprs.is_a?(Array)
         exprs = [exprs]
       end
@@ -665,8 +665,9 @@ module Polars
         reverse = [reverse] * exprs.length
       end
       exprs = Utils.selection_to_rbexpr_list(exprs)
-      Utils.wrap_expr(RbExpr.
+      Utils.wrap_expr(RbExpr.arg_sort_by(exprs, reverse))
     end
+    alias_method :argsort_by, :arg_sort_by

     # Create polars `Duration` from distinct time components.
     #
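`arg_sort_by` returns the row indices that would sort by one or more expressions, with `argsort_by` kept as an alias. A short sketch:

df = Polars::DataFrame.new({"a" => [3, 1, 2], "b" => [6, 4, 5]})

# Indices that would sort by "a" descending: [0, 2, 1].
df.select(Polars.arg_sort_by("a", reverse: true))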
data/lib/polars/list_expr.rb
CHANGED
@@ -426,7 +426,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #         [null, 1,
+    #   #         [null, 1, … 1]
     #   #         [null, -8, -1]
     #   # ]
     def diff(n: 1, null_behavior: "ignore")
@@ -447,7 +447,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #         [null, 1,
+    #   #         [null, 1, … 3]
     #   #         [null, 10, 2]
     #   # ]
     def shift(periods = 1)
data/lib/polars/list_name_space.rb
CHANGED
@@ -185,7 +185,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #         [null, 1,
+    #   #         [null, 1, … 1]
     #   #         [null, -8, -1]
     #   # ]
     def diff(n: 1, null_behavior: "ignore")
@@ -206,7 +206,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #         [null, 1,
+    #   #         [null, 1, … 3]
     #   #         [null, 10, 2]
     #   # ]
     def shift(periods = 1)
data/lib/polars/series.rb
CHANGED
@@ -3531,6 +3531,13 @@ module Polars
       ListNameSpace.new(self)
     end

+    # Create an object namespace of all binary related methods.
+    #
+    # @return [BinaryNameSpace]
+    def bin
+      BinaryNameSpace.new(self)
+    end
+
     # Create an object namespace of all categorical related methods.
     #
     # @return [CatNameSpace]
@@ -3795,7 +3802,8 @@ module Polars
       UInt32 => RbSeries.method(:new_opt_u32),
       UInt64 => RbSeries.method(:new_opt_u64),
       Boolean => RbSeries.method(:new_opt_bool),
-      Utf8 => RbSeries.method(:new_str)
+      Utf8 => RbSeries.method(:new_str),
+      Binary => RbSeries.method(:new_binary)
     }

     SYM_TYPE_TO_CONSTRUCTOR = {
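Series now exposes the binary namespace added in the new binary_name_space.rb, and the Binary dtype gets a native constructor. A hedged sketch, assuming the usual `Series.new(name, values, dtype:)` keywords; the namespace's own methods live in the new file and are not shown in this diff:

# Build a Binary-dtype Series, then reach its binary namespace.
s = Polars::Series.new("data", ["\x0F\xFF".b, "\x10".b], dtype: Polars::Binary)
s.bin # => a Polars::BinaryNameSpace bound to s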
data/lib/polars/string_expr.rb
CHANGED
data/lib/polars/utils.rb
CHANGED
@@ -93,8 +93,12 @@ module Polars
       Polars.lit(value)
     end

-    def self.
-      File.expand_path(path)
+    def self.normalise_filepath(path, check_not_directory: true)
+      path = File.expand_path(path)
+      if check_not_directory && File.exist?(path) && Dir.exist?(path)
+        raise ArgumentError, "Expected a file path; #{path} is a directory"
+      end
+      path
     end

     # TODO fix
@@ -216,5 +220,9 @@ module Polars
         val.is_a?(Array) && _is_iterable_of(val, String)
       end
     end
+
+    def self.local_file?(file)
+      Dir.glob(file).any?
+    end
   end
 end
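`Utils.normalise_filepath` expands the path and, unless called with `check_not_directory: false`, raises when the expanded path is a directory; `Utils.local_file?` just asks whether a glob matches anything. A hedged illustration (paths are hypothetical):

Polars::Utils.normalise_filepath("~/data.csv")  # => "/home/me/data.csv"
Polars::Utils.normalise_filepath("/tmp")        # raises ArgumentError: a directory
Polars::Utils.local_file?("data/*.parquet")     # => true if the glob matches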
data/lib/polars/version.rb
CHANGED
data/lib/polars.rb
CHANGED
@@ -12,6 +12,8 @@ require "stringio"
 # modules
 require_relative "polars/expr_dispatch"
 require_relative "polars/batched_csv_reader"
+require_relative "polars/binary_expr"
+require_relative "polars/binary_name_space"
 require_relative "polars/cat_expr"
 require_relative "polars/cat_name_space"
 require_relative "polars/convert"
|