polars-df 0.22.0-x86_64-linux-musl → 0.23.0-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +112 -89
- data/LICENSE-THIRD-PARTY.txt +528 -77
- data/README.md +8 -7
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/data_frame.rb +11 -9
- data/lib/polars/data_types.rb +9 -1
- data/lib/polars/date_time_expr.rb +35 -14
- data/lib/polars/expr.rb +2 -2
- data/lib/polars/iceberg_dataset.rb +113 -0
- data/lib/polars/io/iceberg.rb +8 -1
- data/lib/polars/io/ipc.rb +28 -49
- data/lib/polars/io/scan_options.rb +9 -3
- data/lib/polars/io/utils.rb +17 -0
- data/lib/polars/lazy_frame.rb +5 -2
- data/lib/polars/scan_cast_options.rb +4 -1
- data/lib/polars/selectors.rb +8 -8
- data/lib/polars/series.rb +23 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/string_name_space.rb +1 -1
- data/lib/polars/utils/convert.rb +2 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -2
data/README.md
CHANGED
@@ -400,13 +400,14 @@ Polars::DataFrame.new(data, schema: {"a" => Polars::Int32, "b" => Polars::Float3
 Supported types are:
 
 - boolean - `Boolean`
-
-
-
-
-
-
-
+- decimal - `Decimal`
+- float - `Float32`, `Float64`
+- integer - `Int8`, `Int16`, `Int32`, `Int64`, `Int128`
+- unsigned integer - `UInt8`, `UInt16`, `UInt32`, `UInt64`, `UInt128`
+- string - `String`, `Categorical`, `Enum`
+- temporal - `Date`, `Datetime`, `Duration`, `Time`
+- nested - `Array`, `List`, `Struct`
+- other - `Binary`, `Object`, `Null`, `Unknown`
 
 Get column types

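For illustration, a minimal sketch of a frame built against the expanded type list (names and values are made up; `UInt128` and the uninstanced `Decimal` are the entries new in this release):

df = Polars::DataFrame.new(
  {"id" => [1, 2], "price" => [1.25, 2.5]},
  schema: {"id" => Polars::UInt128, "price" => Polars::Decimal}  # Decimal now infers precision/scale (see series.rb below)
)
p df.schema
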
data/lib/polars/3.2/polars.so
CHANGED
Binary file
data/lib/polars/3.3/polars.so
CHANGED
Binary file
data/lib/polars/3.4/polars.so
CHANGED
Binary file
data/lib/polars/data_frame.rb
CHANGED
@@ -75,8 +75,8 @@ module Polars
 # Read a serialized DataFrame from a file.
 #
 # @param source [Object]
-#
-#
+# Path to a file or a file-like object (by file-like object, we refer to
+# objects that have a `read` method, such as a file handler or `StringIO`).
 #
 # @return [DataFrame]
 #
@@ -6059,8 +6059,13 @@ module Polars
 # The fields will be inserted into the `DataFrame` on the location of the
 # `struct` type.
 #
-# @param
-#
+# @param columns [Object]
+# Name of the struct column(s) that should be unnested.
+# @param more_columns [Array]
+# Additional columns to unnest, specified as positional arguments.
+# @param separator [String]
+# Rename output column names as combination of the struct column name,
+# name separator and field name.
 #
 # @return [DataFrame]
 #
@@ -6086,11 +6091,8 @@ module Polars
 # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
 # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
 # # └────────┴─────┴─────┴──────┴───────────┴───────┘
-def unnest(
-
-  names = [names]
-end
-_from_rbdf(_df.unnest(names))
+def unnest(columns, *more_columns, separator: nil)
+  lazy.unnest(columns, *more_columns, separator: separator).collect(_eager: true)
 end
 
 # Requires NumPy

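A usage sketch of the reworked `unnest` with the new `separator:` keyword (data and column names are illustrative, not taken from this diff):

df = Polars::DataFrame.new(
  {
    "before" => ["foo", "bar"],
    "t_struct" => [{"a" => 1, "b" => true}, {"a" => 2, "b" => nil}]
  }
)

df.unnest("t_struct")                  # struct fields become columns: before, a, b
df.unnest("t_struct", separator: ":")  # prefixed field names: before, t_struct:a, t_struct:b
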
data/lib/polars/data_types.rb
CHANGED
@@ -209,6 +209,10 @@ module Polars
 class UInt64 < UnsignedIntegerType
 end
 
+# 128-bit unsigned integer type.
+class UInt128 < UnsignedIntegerType
+end
+
 # 32-bit floating point type.
 class Float32 < FloatType
 end
@@ -223,7 +227,11 @@ module Polars
 class Decimal < NumericType
   attr_reader :precision, :scale
 
-  def initialize(precision, scale)
+  def initialize(precision = nil, scale = 0)
+    if precision.nil?
+      precision = 38
+    end
+
     @precision = precision
     @scale = scale
   end

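In practice the new defaults mean the decimal dtype can now be written without arguments; a short sketch:

Polars::Decimal.new         # precision defaults to 38, scale to 0
Polars::Decimal.new(10, 5)  # explicit precision and scale, as before
Polars::UInt128             # new 128-bit unsigned integer dtype
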
data/lib/polars/date_time_expr.rb
CHANGED
@@ -1435,6 +1435,9 @@ module Polars
 
 # Extract the days from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1462,13 +1465,16 @@ module Polars
 # # │ 2020-04-01 00:00:00 ┆ 31 │
 # # │ 2020-05-01 00:00:00 ┆ 30 │
 # # └─────────────────────┴───────────┘
-def total_days
-  Utils.wrap_expr(_rbexpr.dt_total_days)
+def total_days(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_days(fractional))
 end
 alias_method :days, :total_days
 
 # Extract the hours from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1497,13 +1503,16 @@ module Polars
 # # │ 2020-01-03 00:00:00 ┆ 24 │
 # # │ 2020-01-04 00:00:00 ┆ 24 │
 # # └─────────────────────┴────────────┘
-def total_hours
-  Utils.wrap_expr(_rbexpr.dt_total_hours)
+def total_hours(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_hours(fractional))
 end
 alias_method :hours, :total_hours
 
 # Extract the minutes from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1532,13 +1541,16 @@ module Polars
 # # │ 2020-01-03 00:00:00 ┆ 1440 │
 # # │ 2020-01-04 00:00:00 ┆ 1440 │
 # # └─────────────────────┴──────────────┘
-def total_minutes
-  Utils.wrap_expr(_rbexpr.dt_total_minutes)
+def total_minutes(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_minutes(fractional))
 end
 alias_method :minutes, :total_minutes
 
 # Extract the seconds from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1568,13 +1580,16 @@ module Polars
 # # │ 2020-01-01 00:03:00 ┆ 60 │
 # # │ 2020-01-01 00:04:00 ┆ 60 │
 # # └─────────────────────┴──────────────┘
-def total_seconds
-  Utils.wrap_expr(_rbexpr.dt_total_seconds)
+def total_seconds(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_seconds(fractional))
 end
 alias_method :seconds, :total_seconds
 
 # Extract the milliseconds from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1610,13 +1625,16 @@ module Polars
 # # │ 2020-01-01 00:00:00.999 ┆ 1 │
 # # │ 2020-01-01 00:00:01 ┆ 1 │
 # # └─────────────────────────┴───────────────────┘
-def total_milliseconds
-  Utils.wrap_expr(_rbexpr.dt_total_milliseconds)
+def total_milliseconds(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_milliseconds(fractional))
 end
 alias_method :milliseconds, :total_milliseconds
 
 # Extract the microseconds from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1652,13 +1670,16 @@ module Polars
 # # │ 2020-01-01 00:00:00.999 ┆ 1000 │
 # # │ 2020-01-01 00:00:01 ┆ 1000 │
 # # └─────────────────────────┴───────────────────┘
-def total_microseconds
-  Utils.wrap_expr(_rbexpr.dt_total_microseconds)
+def total_microseconds(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_microseconds(fractional))
 end
 alias_method :microseconds, :total_microseconds
 
 # Extract the nanoseconds from a Duration type.
 #
+# @param fractional [Boolean]
+# Whether to include the fractional component of the second.
+#
 # @return [Expr]
 #
 # @example
@@ -1694,8 +1715,8 @@ module Polars
 # # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
 # # │ 2020-01-01 00:00:01 ┆ 1000000 │
 # # └─────────────────────────┴──────────────────┘
-def total_nanoseconds
-  Utils.wrap_expr(_rbexpr.dt_total_nanoseconds)
+def total_nanoseconds(fractional: false)
+  Utils.wrap_expr(_rbexpr.dt_total_nanoseconds(fractional))
 end
 alias_method :nanoseconds, :total_nanoseconds

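A hedged sketch of the new `fractional:` keyword on the Duration `total_*` helpers (input times and expected outputs are illustrative):

df = Polars::DataFrame.new(
  {
    "start" => [Time.utc(2020, 1, 1)],
    "stop" => [Time.utc(2020, 1, 1, 0, 0, 1, 500_000)]  # 1.5 seconds later
  }
)
df.select(
  (Polars.col("stop") - Polars.col("start")).dt.total_seconds.alias("secs"),
  (Polars.col("stop") - Polars.col("start")).dt.total_seconds(fractional: true).alias("secs_frac")
)
# secs is expected to be 1 (whole seconds); secs_frac is expected to be 1.5
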
data/lib/polars/expr.rb
CHANGED
@@ -6612,8 +6612,8 @@ module Polars
 # # │ 10 ┆ null │
 # # │ 11 ┆ 0.1 │
 # # │ 12 ┆ 0.090909 │
-# # │ null ┆
-# # │ 12 ┆
+# # │ null ┆ null │
+# # │ 12 ┆ null │
 # # └──────┴────────────┘
 def pct_change(n: 1)
   n = Utils.parse_into_expression(n)

data/lib/polars/iceberg_dataset.rb
ADDED
@@ -0,0 +1,113 @@
+module Polars
+  # @private
+  class IcebergDataset
+    def initialize(
+      source,
+      snapshot_id:,
+      storage_options:
+    )
+      @source = source
+      @snapshot_id = snapshot_id
+      @storage_options = storage_options
+    end
+
+    def to_lazyframe
+      # for iceberg < 0.1.3
+      if !@source.respond_to?(:scan)
+        return @source.to_polars(snapshot_id: @snapshot_id, storage_options: @storage_options)
+      end
+
+      scan = @source.scan(snapshot_id: @snapshot_id)
+      files = scan.plan_files
+
+      table = scan.table
+      snapshot = scan.snapshot
+      schema = snapshot ? table.schema_by_id(snapshot[:schema_id]) : table.current_schema
+
+      if files.empty?
+        # TODO improve
+        schema =
+          schema.fields.to_h do |field|
+            dtype =
+              case field[:type]
+              when "int"
+                Polars::Int32
+              when "long"
+                Polars::Int64
+              when "double"
+                Polars::Float64
+              when "string"
+                Polars::String
+              when "timestamp"
+                Polars::Datetime
+              else
+                raise Todo
+              end
+
+            [field[:name], dtype]
+          end
+
+        LazyFrame.new(schema: schema)
+      else
+        sources = files.map { |v| v[:data_file_path] }
+
+        column_mapping = [
+          "iceberg-column-mapping",
+          arrow_schema(schema)
+        ]
+
+        deletion_files = [
+          "iceberg-position-delete",
+          files.map.with_index
+            .select { |v, i| v[:deletes].any? }
+            .to_h { |v, i| [i, v[:deletes].map { |d| d[:file_path] }] }
+        ]
+
+        scan_options = {
+          storage_options: @storage_options,
+          cast_options: Polars::ScanCastOptions._default_iceberg,
+          allow_missing_columns: true,
+          extra_columns: "ignore",
+          _column_mapping: column_mapping,
+          _deletion_files: deletion_files
+        }
+
+        Polars.scan_parquet(sources, **scan_options)
+      end
+    end
+
+    private
+
+    def arrow_schema(schema)
+      fields =
+        schema.fields.map do |field|
+          type =
+            case field[:type]
+            when "boolean"
+              "boolean"
+            when "int"
+              "int32"
+            when "long"
+              "int64"
+            when "float"
+              "float32"
+            when "double"
+              "float64"
+            else
+              raise Todo
+            end
+
+          {
+            name: field[:name],
+            type: type,
+            nullable: !field[:required],
+            metadata: {
+              "PARQUET:field_id" => field[:id].to_s
+            }
+          }
+        end
+
+      {fields: fields}
+    end
+  end
+end

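The class is marked `@private`; normal use goes through the gem's Iceberg reader in `data/lib/polars/io/iceberg.rb`, whose diff is not shown above. A direct sketch, assuming `table` is an iceberg gem table object that responds to `scan`:

dataset = Polars::IcebergDataset.new(table, snapshot_id: nil, storage_options: nil)
lf = dataset.to_lazyframe  # Parquet scan with Iceberg column mapping and position deletes applied
lf.collect
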
data/lib/polars/io/iceberg.rb
CHANGED
data/lib/polars/io/ipc.rb
CHANGED
@@ -187,8 +187,16 @@ module Polars
 # DataFrame.
 # @param row_count_offset [Integer]
 # Offset to start the row_count column (only use if the name is set).
+# @param glob [Boolean]
+# Expand path given via globbing rules.
 # @param storage_options [Hash]
 # Extra options that make sense for a particular storage connection.
+# @param retries [Integer]
+# Number of retries if accessing a cloud instance fails.
+# @param file_cache_ttl [Integer]
+# Amount of time to keep downloaded cloud files since their last access time,
+# in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
+# (which defaults to 1 hour) if not given.
 # @param hive_partitioning [Boolean]
 # Infer statistics and schema from Hive partitioned URL and use them
 # to prune reads. This is unset by default (i.e. `nil`), meaning it is
@@ -210,66 +218,37 @@ module Polars
   rechunk: true,
   row_count_name: nil,
   row_count_offset: 0,
+  glob: true,
   storage_options: nil,
+  retries: 2,
+  file_cache_ttl: nil,
   hive_partitioning: nil,
   hive_schema: nil,
   try_parse_hive_dates: true,
   include_file_paths: nil
 )
-
-
-  n_rows: n_rows,
-  cache: cache,
-  rechunk: rechunk,
-  row_count_name: row_count_name,
-  row_count_offset: row_count_offset,
-  storage_options: storage_options,
-  hive_partitioning: hive_partitioning,
-  hive_schema: hive_schema,
-  try_parse_hive_dates: try_parse_hive_dates,
-  include_file_paths: include_file_paths
-)
-end
-
-# @private
-def _scan_ipc_impl(
-  source,
-  n_rows: nil,
-  cache: true,
-  rechunk: true,
-  row_count_name: nil,
-  row_count_offset: 0,
-  storage_options: nil,
-  hive_partitioning: nil,
-  hive_schema: nil,
-  try_parse_hive_dates: true,
-  include_file_paths: nil
-)
-  sources = []
-  if Utils.pathlike?(source)
-    source = Utils.normalize_filepath(source)
-  elsif source.is_a?(::Array)
-    if Utils.is_path_or_str_sequence(source)
-      sources = source.map { |s| Utils.normalize_filepath(s) }
-    else
-      sources = source
-    end
+  row_index_name = row_count_name
+  row_index_offset = row_count_offset
 
-
-  end
+  sources = get_sources(source)
 
   rblf =
     RbLazyFrame.new_from_ipc(
-      source,
       sources,
-
-
-
-
-
-
-
-
+      ScanOptions.new(
+        row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
+        pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
+        include_file_paths: include_file_paths,
+        glob: glob,
+        hive_partitioning: hive_partitioning,
+        hive_schema: hive_schema,
+        try_parse_hive_dates: try_parse_hive_dates,
+        rechunk: rechunk,
+        cache: cache,
+        storage_options: !storage_options.nil? ? storage_options.to_a : nil,
+        retries: retries
+      ),
+      file_cache_ttl
    )
   Utils.wrap_ldf(rblf)
 end

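A usage sketch of the rewritten IPC scan with the newly documented options, assuming the usual `Polars.scan_ipc` entry point defined in this file (path and option values are illustrative):

lf = Polars.scan_ipc(
  "s3://my-bucket/data/*.arrow",   # hypothetical cloud path, expanded because glob: true
  glob: true,
  retries: 3,                      # retry failed cloud requests
  file_cache_ttl: 3600,            # keep downloaded files cached for an hour
  storage_options: {"aws_region" => "us-east-1"}
)
lf.head(5).collect
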
data/lib/polars/io/scan_options.rb
CHANGED
@@ -2,9 +2,9 @@ module Polars
 module IO
   class ScanOptions
     attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
-      :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
+      :include_file_paths, :glob, :hidden_file_prefix, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
       :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
-      :default_values, :deletion_files
+      :default_values, :deletion_files, :table_statistics, :row_count
 
     def initialize(
       row_index: nil,
@@ -14,6 +14,7 @@ module Polars
       missing_columns: "raise",
       include_file_paths: nil,
       glob: true,
+      hidden_file_prefix: nil,
       hive_partitioning: nil,
       hive_schema: nil,
       try_parse_hive_dates: true,
@@ -24,7 +25,9 @@ module Polars
       retries: 2,
       column_mapping: nil,
       default_values: nil,
-      deletion_files: nil
+      deletion_files: nil,
+      table_statistics: nil,
+      row_count: nil
     )
       @row_index = row_index
       @pre_slice = pre_slice
@@ -33,6 +36,7 @@ module Polars
       @missing_columns = missing_columns
       @include_file_paths = include_file_paths
       @glob = glob
+      @hidden_file_prefix = hidden_file_prefix
       @hive_partitioning = hive_partitioning
       @hive_schema = hive_schema
       @try_parse_hive_dates = try_parse_hive_dates
@@ -44,6 +48,8 @@ module Polars
       @column_mapping = column_mapping
       @default_values = default_values
       @deletion_files = deletion_files
+      @table_statistics = table_statistics
+      @row_count = row_count
     end
   end
 end

data/lib/polars/io/utils.rb
ADDED
@@ -0,0 +1,17 @@
+module Polars
+  module IO
+    private
+
+    def get_sources(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source, check_not_directory: false)
+      elsif Utils.is_path_or_str_sequence(source)
+        source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
+      end
+      unless source.is_a?(::Array)
+        source = [source]
+      end
+      source
+    end
+  end
+end

data/lib/polars/lazy_frame.rb
CHANGED
@@ -4143,6 +4143,9 @@ module Polars
 # Names of the struct columns that will be decomposed by its fields
 # @param more_columns [Array]
 # Additional columns to unnest, specified as positional arguments.
+# @param separator [String]
+# Rename output column names as combination of the struct column name,
+# name separator and field name.
 #
 # @return [LazyFrame]
 #
@@ -4187,11 +4190,11 @@ module Polars
 # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
 # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
 # # └────────┴─────┴─────┴──────┴───────────┴───────┘
-def unnest(columns, *more_columns)
+def unnest(columns, *more_columns, separator: nil)
   subset = Utils.parse_list_into_selector(columns) | Utils.parse_list_into_selector(
     more_columns
   )
-  _from_rbldf(_ldf.unnest(subset._rbselector))
+  _from_rbldf(_ldf.unnest(subset._rbselector, separator))
 end
 
 # Take two sorted DataFrames and merge them by the sorted key.

data/lib/polars/scan_cast_options.rb
CHANGED
@@ -1,7 +1,7 @@
 module Polars
   # Options for scanning files.
   class ScanCastOptions
-    attr_reader :integer_cast, :float_cast, :datetime_cast, :missing_struct_fields, :extra_struct_fields
+    attr_reader :integer_cast, :float_cast, :datetime_cast, :missing_struct_fields, :extra_struct_fields, :categorical_to_string
 
     # Common configuration for scanning files.
     #
@@ -50,6 +50,7 @@ module Polars
       datetime_cast: "forbid",
       missing_struct_fields: "raise",
       extra_struct_fields: "raise",
+      categorical_to_string: "forbid",
       _internal_call: false
     )
       if !_internal_call
@@ -61,6 +62,7 @@ module Polars
       @datetime_cast = datetime_cast
       @missing_struct_fields = missing_struct_fields
       @extra_struct_fields = extra_struct_fields
+      @categorical_to_string = categorical_to_string
     end
 
     def self._default
@@ -75,6 +77,7 @@ module Polars
       datetime_cast: ["nanosecond-downcast", "convert-timezone"],
       missing_struct_fields: "insert",
       extra_struct_fields: "ignore",
+      categorical_to_string: "allow",
       _internal_call: true
     )
   end

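Illustrative only: the new field can be supplied through `cast_options:` when scanning (the Iceberg dataset above relies on `_default_iceberg` for the same purpose); values other than the default "forbid" let `Categorical` data be read back as `String`:

opts = Polars::ScanCastOptions.new(categorical_to_string: "allow")
lf = Polars.scan_parquet(["a.parquet", "b.parquet"], cast_options: opts)  # hypothetical paths
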
data/lib/polars/selectors.rb
CHANGED
@@ -1063,14 +1063,14 @@ module Polars
 # df.select(Polars.cs.decimal)
 # # =>
 # # shape: (2, 2)
-# #
-# # │ bar
-# # │ ---
-# # │ decimal[
-# #
-# # │ 123
-# # │ 456
-# #
+# # ┌───────────────┬───────────────┐
+# # │ bar ┆ baz │
+# # │ --- ┆ --- │
+# # │ decimal[38,0] ┆ decimal[10,5] │
+# # ╞═══════════════╪═══════════════╡
+# # │ 123 ┆ 2.00050 │
+# # │ 456 ┆ -50.55550 │
+# # └───────────────┴───────────────┘
 #
 # @example Select all columns *except* the decimal ones:
 #

data/lib/polars/series.rb
CHANGED
@@ -5747,11 +5747,31 @@ module Polars
 end
 
 base_type = dtype.is_a?(DataType) ? dtype.class : dtype
-if [Date, Datetime, Duration, Time, Categorical, Boolean, Enum
+if [Date, Datetime, Duration, Time, Categorical, Boolean, Enum].include?(base_type) || dtype.is_a?(Decimal)
   if rbseries.dtype != dtype
     rbseries = rbseries.cast(dtype, true)
   end
 end
+
+# Uninstanced Decimal is a bit special and has various inference paths
+if dtype == Decimal
+  if rbseries.dtype == String
+    rbseries = rbseries.str_to_decimal_infer(0)
+  elsif rbseries.dtype.float?
+    # Go through string so we infer an appropriate scale.
+    rbseries = rbseries.cast(
+      String, strict: strict, wrap_numerical: false
+    ).str_to_decimal_infer(0)
+  elsif rbseries.dtype.integer? || rbseries.dtype == Null
+    rbseries = rbseries.cast(
+      Decimal.new(nil, 0), strict: strict, wrap_numerical: false
+    )
+  elsif !rbseries.dtype.is_a?(Decimal)
+    msg = "can't convert #{rbseries.dtype} to Decimal"
+    raise TypeError, msg
+  end
+end
+
 rbseries
 elsif dtype == Struct
 struct_schema = dtype.is_a?(Struct) ? dtype.to_schema : nil
@@ -5856,6 +5876,7 @@ module Polars
 UInt16 => RbSeries.method(:new_opt_u16),
 UInt32 => RbSeries.method(:new_opt_u32),
 UInt64 => RbSeries.method(:new_opt_u64),
+UInt128 => RbSeries.method(:new_opt_u128),
 Decimal => RbSeries.method(:new_decimal),
 Date => RbSeries.method(:new_from_any_values),
 Datetime => RbSeries.method(:new_from_any_values),
@@ -5882,6 +5903,7 @@ module Polars
 u16: RbSeries.method(:new_opt_u16),
 u32: RbSeries.method(:new_opt_u32),
 u64: RbSeries.method(:new_opt_u64),
+u128: RbSeries.method(:new_opt_u128),
 bool: RbSeries.method(:new_opt_bool),
 str: RbSeries.method(:new_str)
 }

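The new inference paths in sketch form (values illustrative); an uninstanced `Polars::Decimal` now resolves the scale from the data:

Polars::Series.new("a", ["1.25", "2.5"], dtype: Polars::Decimal)  # strings parsed, scale inferred
Polars::Series.new("b", [1.25, 2.5], dtype: Polars::Decimal)      # floats routed through strings to pick a scale
Polars::Series.new("c", [1, 2, 3], dtype: Polars::Decimal)        # integers become decimal with scale 0
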
data/lib/polars/string_expr.rb
CHANGED
@@ -247,7 +247,7 @@ module Polars
 # # ┌───────────┬─────────────────┐
 # # │ numbers ┆ numbers_decimal │
 # # │ --- ┆ --- │
-# # │ str ┆ decimal[
+# # │ str ┆ decimal[38,2] │
 # # ╞═══════════╪═════════════════╡
 # # │ 40.12 ┆ 40.12 │
 # # │ 3420.13 ┆ 3420.13 │