parquet 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +11 -12
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +110 -0
- data/ext/parquet/src/reader/mod.rs +1 -43
- data/ext/parquet/src/reader/parquet_column_reader.rs +50 -86
- data/ext/parquet/src/reader/parquet_row_reader.rs +53 -23
- data/ext/parquet/src/ruby_reader.rs +37 -25
- data/ext/parquet/src/types/core_types.rs +47 -6
- data/ext/parquet/src/types/mod.rs +64 -1
- data/ext/parquet/src/types/parquet_value.rs +284 -102
- data/ext/parquet/src/types/record_types.rs +24 -23
- data/ext/parquet/src/types/schema_converter.rs +244 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +16 -8
- data/ext/parquet/src/types/type_conversion.rs +1151 -521
- data/ext/parquet/src/types/writer_types.rs +94 -151
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +342 -457
- data/ext/parquet/src/writer/write_columns.rs +226 -0
- data/ext/parquet/src/writer/write_rows.rs +484 -0
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0d72c16371c10a011af5118f2915de9bbeb33cde133369bdac2050e3c035572e
|
4
|
+
data.tar.gz: b39c6ec9a8232eca5b5b156bf28992ed59c05e9a36e4c13db2b8933a74485ba0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c7f338b1d010fa59c2344065b233ff20a08d4a17c6ca987ef72677150dd1cbf55d134855585d68e187b748dc5121f13d5e86cb82aabc1eeb3562a3326aca459c
|
7
|
+
data.tar.gz: 69eaa6b133123944138a826612a7b48d9f87acb202ecbe172e253be02a1a1c7009e3d7182e8bb31ae423098bc34bb5dddc4ce042453f0d1cb41505d56d02c21e
|
data/Cargo.lock
CHANGED
@@ -63,9 +63,9 @@ dependencies = [
|
|
63
63
|
|
64
64
|
[[package]]
|
65
65
|
name = "arrow-array"
|
66
|
-
version = "54.
|
66
|
+
version = "54.2.0"
|
67
67
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
68
|
-
checksum = "
|
68
|
+
checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a"
|
69
69
|
dependencies = [
|
70
70
|
"ahash",
|
71
71
|
"arrow-buffer",
|
@@ -79,9 +79,9 @@ dependencies = [
|
|
79
79
|
|
80
80
|
[[package]]
|
81
81
|
name = "arrow-buffer"
|
82
|
-
version = "54.
|
82
|
+
version = "54.2.0"
|
83
83
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
84
|
-
checksum = "
|
84
|
+
checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a"
|
85
85
|
dependencies = [
|
86
86
|
"bytes",
|
87
87
|
"half",
|
@@ -90,9 +90,9 @@ dependencies = [
|
|
90
90
|
|
91
91
|
[[package]]
|
92
92
|
name = "arrow-cast"
|
93
|
-
version = "54.
|
93
|
+
version = "54.2.0"
|
94
94
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
95
|
-
checksum = "
|
95
|
+
checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee"
|
96
96
|
dependencies = [
|
97
97
|
"arrow-array",
|
98
98
|
"arrow-buffer",
|
@@ -110,9 +110,9 @@ dependencies = [
|
|
110
110
|
|
111
111
|
[[package]]
|
112
112
|
name = "arrow-data"
|
113
|
-
version = "54.
|
113
|
+
version = "54.2.0"
|
114
114
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
115
|
-
checksum = "
|
115
|
+
checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83"
|
116
116
|
dependencies = [
|
117
117
|
"arrow-buffer",
|
118
118
|
"arrow-schema",
|
@@ -122,9 +122,9 @@ dependencies = [
|
|
122
122
|
|
123
123
|
[[package]]
|
124
124
|
name = "arrow-ipc"
|
125
|
-
version = "54.
|
125
|
+
version = "54.2.0"
|
126
126
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
127
|
-
checksum = "
|
127
|
+
checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6"
|
128
128
|
dependencies = [
|
129
129
|
"arrow-array",
|
130
130
|
"arrow-buffer",
|
@@ -135,15 +135,15 @@ dependencies = [
|
|
135
135
|
|
136
136
|
[[package]]
|
137
137
|
name = "arrow-schema"
|
138
|
-
version = "54.
|
138
|
+
version = "54.2.0"
|
139
139
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
140
|
-
checksum = "
|
140
|
+
checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735"
|
141
141
|
|
142
142
|
[[package]]
|
143
143
|
name = "arrow-select"
|
144
|
-
version = "54.
|
144
|
+
version = "54.2.0"
|
145
145
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
146
|
-
checksum = "
|
146
|
+
checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539"
|
147
147
|
dependencies = [
|
148
148
|
"ahash",
|
149
149
|
"arrow-array",
|
@@ -247,9 +247,9 @@ checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9"
|
|
247
247
|
|
248
248
|
[[package]]
|
249
249
|
name = "cc"
|
250
|
-
version = "1.2.
|
250
|
+
version = "1.2.15"
|
251
251
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
252
|
-
checksum = "
|
252
|
+
checksum = "c736e259eea577f443d5c86c304f9f4ae0295c43f3ba05c21f1d66b5f06001af"
|
253
253
|
dependencies = [
|
254
254
|
"jobserver",
|
255
255
|
"libc",
|
@@ -273,14 +273,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
273
273
|
|
274
274
|
[[package]]
|
275
275
|
name = "chrono"
|
276
|
-
version = "0.4.
|
276
|
+
version = "0.4.40"
|
277
277
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
278
|
-
checksum = "
|
278
|
+
checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c"
|
279
279
|
dependencies = [
|
280
280
|
"android-tzdata",
|
281
281
|
"iana-time-zone",
|
282
282
|
"num-traits",
|
283
|
-
"windows-
|
283
|
+
"windows-link",
|
284
284
|
]
|
285
285
|
|
286
286
|
[[package]]
|
@@ -337,9 +337,9 @@ checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
|
|
337
337
|
|
338
338
|
[[package]]
|
339
339
|
name = "either"
|
340
|
-
version = "1.
|
340
|
+
version = "1.14.0"
|
341
341
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
342
|
-
checksum = "
|
342
|
+
checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d"
|
343
343
|
|
344
344
|
[[package]]
|
345
345
|
name = "errno"
|
@@ -369,9 +369,9 @@ dependencies = [
|
|
369
369
|
|
370
370
|
[[package]]
|
371
371
|
name = "flate2"
|
372
|
-
version = "1.0
|
372
|
+
version = "1.1.0"
|
373
373
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
374
|
-
checksum = "
|
374
|
+
checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
|
375
375
|
dependencies = [
|
376
376
|
"crc32fast",
|
377
377
|
"miniz_oxide",
|
@@ -498,9 +498,9 @@ dependencies = [
|
|
498
498
|
|
499
499
|
[[package]]
|
500
500
|
name = "jiff"
|
501
|
-
version = "0.1
|
501
|
+
version = "0.2.1"
|
502
502
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
503
|
-
checksum = "
|
503
|
+
checksum = "3590fea8e9e22d449600c9bbd481a8163bef223e4ff938e5f55899f8cf1adb93"
|
504
504
|
dependencies = [
|
505
505
|
"jiff-tzdb-platform",
|
506
506
|
"log",
|
@@ -622,9 +622,9 @@ dependencies = [
|
|
622
622
|
|
623
623
|
[[package]]
|
624
624
|
name = "libc"
|
625
|
-
version = "0.2.
|
625
|
+
version = "0.2.170"
|
626
626
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
627
|
-
checksum = "
|
627
|
+
checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828"
|
628
628
|
|
629
629
|
[[package]]
|
630
630
|
name = "libloading"
|
@@ -660,9 +660,9 @@ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
|
|
660
660
|
|
661
661
|
[[package]]
|
662
662
|
name = "log"
|
663
|
-
version = "0.4.
|
663
|
+
version = "0.4.26"
|
664
664
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
665
|
-
checksum = "
|
665
|
+
checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
|
666
666
|
|
667
667
|
[[package]]
|
668
668
|
name = "lz4_flex"
|
@@ -719,9 +719,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
719
719
|
|
720
720
|
[[package]]
|
721
721
|
name = "miniz_oxide"
|
722
|
-
version = "0.8.
|
722
|
+
version = "0.8.5"
|
723
723
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
724
|
-
checksum = "
|
724
|
+
checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5"
|
725
725
|
dependencies = [
|
726
726
|
"adler2",
|
727
727
|
]
|
@@ -812,9 +812,9 @@ dependencies = [
|
|
812
812
|
|
813
813
|
[[package]]
|
814
814
|
name = "once_cell"
|
815
|
-
version = "1.20.
|
815
|
+
version = "1.20.3"
|
816
816
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
817
|
-
checksum = "
|
817
|
+
checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e"
|
818
818
|
|
819
819
|
[[package]]
|
820
820
|
name = "ordered-float"
|
@@ -833,12 +833,13 @@ dependencies = [
|
|
833
833
|
"arrow-array",
|
834
834
|
"arrow-schema",
|
835
835
|
"bytes",
|
836
|
+
"either",
|
836
837
|
"itertools 0.14.0",
|
837
838
|
"jemallocator",
|
838
839
|
"jiff",
|
839
840
|
"magnus",
|
840
841
|
"mimalloc",
|
841
|
-
"parquet 54.
|
842
|
+
"parquet 54.2.0",
|
842
843
|
"rand",
|
843
844
|
"rb-sys",
|
844
845
|
"simdutf8",
|
@@ -848,9 +849,9 @@ dependencies = [
|
|
848
849
|
|
849
850
|
[[package]]
|
850
851
|
name = "parquet"
|
851
|
-
version = "54.
|
852
|
+
version = "54.2.0"
|
852
853
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
853
|
-
checksum = "
|
854
|
+
checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb"
|
854
855
|
dependencies = [
|
855
856
|
"ahash",
|
856
857
|
"arrow-array",
|
@@ -895,9 +896,9 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
|
895
896
|
|
896
897
|
[[package]]
|
897
898
|
name = "portable-atomic"
|
898
|
-
version = "1.
|
899
|
+
version = "1.11.0"
|
899
900
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
900
|
-
checksum = "
|
901
|
+
checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
|
901
902
|
|
902
903
|
[[package]]
|
903
904
|
name = "portable-atomic-util"
|
@@ -943,7 +944,7 @@ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
|
|
943
944
|
dependencies = [
|
944
945
|
"rand_chacha",
|
945
946
|
"rand_core",
|
946
|
-
"zerocopy 0.8.
|
947
|
+
"zerocopy 0.8.20",
|
947
948
|
]
|
948
949
|
|
949
950
|
[[package]]
|
@@ -958,12 +959,12 @@ dependencies = [
|
|
958
959
|
|
959
960
|
[[package]]
|
960
961
|
name = "rand_core"
|
961
|
-
version = "0.9.
|
962
|
+
version = "0.9.2"
|
962
963
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
963
|
-
checksum = "
|
964
|
+
checksum = "7a509b1a2ffbe92afab0e55c8fd99dea1c280e8171bd2d88682bb20bc41cbc2c"
|
964
965
|
dependencies = [
|
965
966
|
"getrandom 0.3.1",
|
966
|
-
"zerocopy 0.8.
|
967
|
+
"zerocopy 0.8.20",
|
967
968
|
]
|
968
969
|
|
969
970
|
[[package]]
|
@@ -1079,18 +1080,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
1079
1080
|
|
1080
1081
|
[[package]]
|
1081
1082
|
name = "serde"
|
1082
|
-
version = "1.0.
|
1083
|
+
version = "1.0.218"
|
1083
1084
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1084
|
-
checksum = "
|
1085
|
+
checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60"
|
1085
1086
|
dependencies = [
|
1086
1087
|
"serde_derive",
|
1087
1088
|
]
|
1088
1089
|
|
1089
1090
|
[[package]]
|
1090
1091
|
name = "serde_derive"
|
1091
|
-
version = "1.0.
|
1092
|
+
version = "1.0.218"
|
1092
1093
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1093
|
-
checksum = "
|
1094
|
+
checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b"
|
1094
1095
|
dependencies = [
|
1095
1096
|
"proc-macro2",
|
1096
1097
|
"quote",
|
@@ -1099,9 +1100,9 @@ dependencies = [
|
|
1099
1100
|
|
1100
1101
|
[[package]]
|
1101
1102
|
name = "serde_json"
|
1102
|
-
version = "1.0.
|
1103
|
+
version = "1.0.139"
|
1103
1104
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1104
|
-
checksum = "
|
1105
|
+
checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6"
|
1105
1106
|
dependencies = [
|
1106
1107
|
"itoa",
|
1107
1108
|
"memchr",
|
@@ -1152,9 +1153,9 @@ dependencies = [
|
|
1152
1153
|
|
1153
1154
|
[[package]]
|
1154
1155
|
name = "tempfile"
|
1155
|
-
version = "3.
|
1156
|
+
version = "3.17.1"
|
1156
1157
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1157
|
-
checksum = "
|
1158
|
+
checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230"
|
1158
1159
|
dependencies = [
|
1159
1160
|
"cfg-if",
|
1160
1161
|
"fastrand",
|
@@ -1216,9 +1217,9 @@ dependencies = [
|
|
1216
1217
|
|
1217
1218
|
[[package]]
|
1218
1219
|
name = "unicode-ident"
|
1219
|
-
version = "1.0.
|
1220
|
+
version = "1.0.17"
|
1220
1221
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1221
|
-
checksum = "
|
1222
|
+
checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe"
|
1222
1223
|
|
1223
1224
|
[[package]]
|
1224
1225
|
name = "version_check"
|
@@ -1308,6 +1309,12 @@ dependencies = [
|
|
1308
1309
|
"windows-targets",
|
1309
1310
|
]
|
1310
1311
|
|
1312
|
+
[[package]]
|
1313
|
+
name = "windows-link"
|
1314
|
+
version = "0.1.0"
|
1315
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1316
|
+
checksum = "6dccfd733ce2b1753b03b6d3c65edf020262ea35e20ccdf3e288043e6dd620e3"
|
1317
|
+
|
1311
1318
|
[[package]]
|
1312
1319
|
name = "windows-sys"
|
1313
1320
|
version = "0.59.0"
|
@@ -1402,11 +1409,11 @@ dependencies = [
|
|
1402
1409
|
|
1403
1410
|
[[package]]
|
1404
1411
|
name = "zerocopy"
|
1405
|
-
version = "0.8.
|
1412
|
+
version = "0.8.20"
|
1406
1413
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1407
|
-
checksum = "
|
1414
|
+
checksum = "dde3bb8c68a8f3f1ed4ac9221aad6b10cece3e60a8e2ea54a6a2dec806d0084c"
|
1408
1415
|
dependencies = [
|
1409
|
-
"zerocopy-derive 0.8.
|
1416
|
+
"zerocopy-derive 0.8.20",
|
1410
1417
|
]
|
1411
1418
|
|
1412
1419
|
[[package]]
|
@@ -1422,9 +1429,9 @@ dependencies = [
|
|
1422
1429
|
|
1423
1430
|
[[package]]
|
1424
1431
|
name = "zerocopy-derive"
|
1425
|
-
version = "0.8.
|
1432
|
+
version = "0.8.20"
|
1426
1433
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1427
|
-
checksum = "
|
1434
|
+
checksum = "eea57037071898bf96a6da35fd626f4f27e9cee3ead2a6c703cf09d472b2e700"
|
1428
1435
|
dependencies = [
|
1429
1436
|
"proc-macro2",
|
1430
1437
|
"quote",
|
@@ -1433,9 +1440,9 @@ dependencies = [
|
|
1433
1440
|
|
1434
1441
|
[[package]]
|
1435
1442
|
name = "zstd"
|
1436
|
-
version = "0.13.
|
1443
|
+
version = "0.13.3"
|
1437
1444
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1438
|
-
checksum = "
|
1445
|
+
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
|
1439
1446
|
dependencies = [
|
1440
1447
|
"zstd-safe",
|
1441
1448
|
]
|
data/README.md
CHANGED
@@ -194,4 +194,108 @@ The following data types are supported in the schema:
|
|
194
194
|
- `date32`
|
195
195
|
- `timestamp_millis`, `timestamp_micros`
|
196
196
|
|
197
|
-
|
197
|
+
### Schema DSL for Complex Data Types
|
198
|
+
|
199
|
+
In addition to the hash-based schema definition shown above, this library provides a more expressive DSL for defining complex schemas with nested structures:
|
200
|
+
|
201
|
+
```ruby
|
202
|
+
require "parquet"
|
203
|
+
|
204
|
+
# Define a complex schema using the Schema DSL
|
205
|
+
schema = Parquet::Schema.define do
|
206
|
+
field :id, :int64, nullable: false # Required field
|
207
|
+
field :name, :string # Optional field (nullable: true is default)
|
208
|
+
|
209
|
+
# Nested struct
|
210
|
+
field :address, :struct do
|
211
|
+
field :street, :string
|
212
|
+
field :city, :string
|
213
|
+
field :zip, :string
|
214
|
+
field :coordinates, :struct do
|
215
|
+
field :latitude, :double
|
216
|
+
field :longitude, :double
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
# List of primitives
|
221
|
+
field :scores, :list, item: :float
|
222
|
+
|
223
|
+
# List of structs
|
224
|
+
field :contacts, :list, item: :struct do
|
225
|
+
field :name, :string
|
226
|
+
field :phone, :string
|
227
|
+
field :primary, :boolean
|
228
|
+
end
|
229
|
+
|
230
|
+
# Map with string values
|
231
|
+
field :metadata, :map, key: :string, value: :string
|
232
|
+
|
233
|
+
# Map with struct values
|
234
|
+
field :properties, :map, key: :string, value: :struct do
|
235
|
+
field :count, :int32
|
236
|
+
field :description, :string
|
237
|
+
end
|
238
|
+
|
239
|
+
# Nested lists
|
240
|
+
field :nested_lists, :list, item: :list do
|
241
|
+
field :item, :string # For nested lists, inner item must be named 'item'
|
242
|
+
end
|
243
|
+
|
244
|
+
# Map of lists
|
245
|
+
field :map_of_lists, :map, key: :string, value: :list do
|
246
|
+
field :item, :int32 # For list items in maps, item must be named 'item'
|
247
|
+
end
|
248
|
+
end
|
249
|
+
|
250
|
+
# Sample data with nested structures
|
251
|
+
data = [
|
252
|
+
[
|
253
|
+
1, # id
|
254
|
+
"John Doe", # name
|
255
|
+
{ # address (struct)
|
256
|
+
"street" => "123 Main St",
|
257
|
+
"city" => "Springfield",
|
258
|
+
"zip" => "12345",
|
259
|
+
"coordinates" => {
|
260
|
+
"latitude" => 37.7749,
|
261
|
+
"longitude" => -122.4194
|
262
|
+
}
|
263
|
+
},
|
264
|
+
[85.5, 92.0, 78.5], # scores (list of floats)
|
265
|
+
[ # contacts (list of structs)
|
266
|
+
{ "name" => "Contact 1", "phone" => "555-1234", "primary" => true },
|
267
|
+
{ "name" => "Contact 2", "phone" => "555-5678", "primary" => false }
|
268
|
+
],
|
269
|
+
{ "created" => "2023-01-01", "status" => "active" }, # metadata (map)
|
270
|
+
{ # properties (map of structs)
|
271
|
+
"feature1" => { "count" => 5, "description" => "Main feature" },
|
272
|
+
"feature2" => { "count" => 3, "description" => "Secondary feature" }
|
273
|
+
},
|
274
|
+
[["a", "b"], ["c", "d", "e"]], # nested_lists
|
275
|
+
{ # map_of_lists
|
276
|
+
"group1" => [1, 2, 3],
|
277
|
+
"group2" => [4, 5, 6]
|
278
|
+
}
|
279
|
+
]
|
280
|
+
]
|
281
|
+
|
282
|
+
# Write to a parquet file using the schema
|
283
|
+
Parquet.write_rows(data.each, schema: schema, write_to: "complex_data.parquet")
|
284
|
+
|
285
|
+
# Read back the data
|
286
|
+
Parquet.each_row("complex_data.parquet") do |row|
|
287
|
+
puts row.inspect
|
288
|
+
end
|
289
|
+
```
|
290
|
+
|
291
|
+
The Schema DSL supports:
|
292
|
+
|
293
|
+
- **Primitive types**: All standard Parquet types (`int32`, `string`, etc.)
|
294
|
+
- **Complex types**: Structs, lists, and maps with arbitrary nesting
|
295
|
+
- **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
|
296
|
+
- **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
|
297
|
+
- **Map value nullability**: Control whether map values can be null with `value_nullable: false/true`
|
298
|
+
|
299
|
+
Note: When defining list and map fields, you must provide at least:
|
300
|
+
- For lists: The `item:` parameter specifying the item type
|
301
|
+
- For maps: Both `key:` and `value:` parameters specifying key and value types
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -11,15 +11,16 @@ ahash = "0.8"
|
|
11
11
|
arrow-array = "54.0.0"
|
12
12
|
arrow-schema = "54.0.0"
|
13
13
|
bytes = "^1.9"
|
14
|
+
either = "1.9"
|
14
15
|
itertools = "^0.14"
|
15
|
-
jiff = "0.
|
16
|
+
jiff = "0.2"
|
16
17
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
17
18
|
parquet = { version = "^54.0", features = ["json"] }
|
18
19
|
rand = "0.9"
|
19
20
|
rb-sys = "^0.9"
|
20
|
-
thiserror = "2.0"
|
21
|
-
tempfile = "^3.15"
|
22
21
|
simdutf8 = "0.1.5"
|
22
|
+
tempfile = "^3.15"
|
23
|
+
thiserror = "2.0"
|
23
24
|
|
24
25
|
[target.'cfg(target_os = "linux")'.dependencies]
|
25
26
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
@@ -7,6 +7,7 @@ pub struct RowEnumeratorArgs {
|
|
7
7
|
pub result_type: ParserResultType,
|
8
8
|
pub columns: Option<Vec<String>>,
|
9
9
|
pub strict: bool,
|
10
|
+
pub logger: Option<Value>,
|
10
11
|
}
|
11
12
|
|
12
13
|
/// Creates an enumerator for lazy Parquet row parsing
|
@@ -22,6 +23,9 @@ pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerat
|
|
22
23
|
if args.strict {
|
23
24
|
kwargs.aset(Symbol::new("strict"), true)?;
|
24
25
|
}
|
26
|
+
if let Some(logger) = args.logger {
|
27
|
+
kwargs.aset(Symbol::new("logger"), logger)?;
|
28
|
+
}
|
25
29
|
Ok(args
|
26
30
|
.rb_self
|
27
31
|
.enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
|
@@ -34,6 +38,7 @@ pub struct ColumnEnumeratorArgs {
|
|
34
38
|
pub columns: Option<Vec<String>>,
|
35
39
|
pub batch_size: Option<usize>,
|
36
40
|
pub strict: bool,
|
41
|
+
pub logger: Option<Value>,
|
37
42
|
}
|
38
43
|
|
39
44
|
#[inline]
|
@@ -54,6 +59,9 @@ pub fn create_column_enumerator(
|
|
54
59
|
if args.strict {
|
55
60
|
kwargs.aset(Symbol::new("strict"), true)?;
|
56
61
|
}
|
62
|
+
if let Some(logger) = args.logger {
|
63
|
+
kwargs.aset(Symbol::new("logger"), logger)?;
|
64
|
+
}
|
57
65
|
Ok(args
|
58
66
|
.rb_self
|
59
67
|
.enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
|
@@ -6,10 +6,7 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
10
|
-
atomic::{AtomicU32, Ordering},
|
11
|
-
LazyLock, Mutex,
|
12
|
-
},
|
9
|
+
sync::{LazyLock, Mutex},
|
13
10
|
};
|
14
11
|
|
15
12
|
use magnus::{IntoValue, RString, Ruby, Value};
|
@@ -20,9 +17,11 @@ use thiserror::Error;
|
|
20
17
|
pub enum CacheError {
|
21
18
|
#[error("Failed to acquire lock: {0}")]
|
22
19
|
LockError(String),
|
20
|
+
#[error("Failed to convert Ruby String to interned string: {0}")]
|
21
|
+
RStringConversion(String),
|
23
22
|
}
|
24
23
|
|
25
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str,
|
24
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, StringCacheKey>>> =
|
26
25
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
27
26
|
|
28
27
|
pub struct StringCache;
|
@@ -31,10 +30,12 @@ pub struct StringCache;
|
|
31
30
|
pub struct StringCacheKey(&'static str);
|
32
31
|
|
33
32
|
impl StringCacheKey {
|
34
|
-
pub fn new(string: &str) -> Self {
|
33
|
+
pub fn new(string: &str) -> Result<Self, CacheError> {
|
35
34
|
let rstr = RString::new(string);
|
36
35
|
let fstr = rstr.to_interned_str();
|
37
|
-
Self(fstr.as_str().
|
36
|
+
Ok(Self(fstr.as_str().map_err(|e| {
|
37
|
+
CacheError::RStringConversion(e.to_string())
|
38
|
+
})?))
|
38
39
|
}
|
39
40
|
}
|
40
41
|
|
@@ -80,18 +81,16 @@ impl StringCache {
|
|
80
81
|
pub fn intern_many<AsStr: AsRef<str>>(
|
81
82
|
strings: &[AsStr],
|
82
83
|
) -> Result<Vec<StringCacheKey>, CacheError> {
|
83
|
-
let
|
84
|
+
let cache = STRING_CACHE
|
84
85
|
.lock()
|
85
86
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
86
87
|
|
87
88
|
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
88
89
|
for string in strings {
|
89
|
-
if let Some((_,
|
90
|
-
counter.fetch_add(1, Ordering::Relaxed);
|
90
|
+
if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) {
|
91
91
|
result.push(*interned_string);
|
92
92
|
} else {
|
93
|
-
let interned = StringCacheKey::new(string.as_ref())
|
94
|
-
cache.insert(interned.0, (interned, AtomicU32::new(1)));
|
93
|
+
let interned = StringCacheKey::new(string.as_ref())?;
|
95
94
|
result.push(interned);
|
96
95
|
}
|
97
96
|
}
|