parquet 0.2.8 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
4
- data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
3
+ metadata.gz: d7ad6471a6c32833344fa6196794733c398a8de814652cc85f1121ab85f14f86
4
+ data.tar.gz: edeb31c6b6683b45913e58782753678302c980165aab4764bbc5b498c1203798
5
5
  SHA512:
6
- metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
7
- data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
6
+ metadata.gz: 13929dda3279394f9a8b2c25a3c605fd813393c44030b2c5fc52c815e0582b75c008bcb4146b08d6079ab3d19e8d545ffbe00b5a10d6757556d4f27122039927
7
+ data.tar.gz: a542706c99bf184b0833239950d7a269dad091dece68e09dc495528df14f90ae614fc76017a8af64231e081c247a36def095b31deaa454eac9874c60805c47ca
data/Cargo.lock CHANGED
@@ -2,15 +2,6 @@
2
2
  # It is not intended for manual editing.
3
3
  version = 4
4
4
 
5
- [[package]]
6
- name = "addr2line"
7
- version = "0.24.2"
8
- source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
10
- dependencies = [
11
- "gimli",
12
- ]
13
-
14
5
  [[package]]
15
6
  name = "adler2"
16
7
  version = "2.0.0"
@@ -25,10 +16,10 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
25
16
  dependencies = [
26
17
  "cfg-if",
27
18
  "const-random",
28
- "getrandom",
19
+ "getrandom 0.2.15",
29
20
  "once_cell",
30
21
  "version_check",
31
- "zerocopy",
22
+ "zerocopy 0.7.35",
32
23
  ]
33
24
 
34
25
  [[package]]
@@ -162,17 +153,6 @@ dependencies = [
162
153
  "num",
163
154
  ]
164
155
 
165
- [[package]]
166
- name = "async-trait"
167
- version = "0.1.83"
168
- source = "registry+https://github.com/rust-lang/crates.io-index"
169
- checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd"
170
- dependencies = [
171
- "proc-macro2",
172
- "quote",
173
- "syn",
174
- ]
175
-
176
156
  [[package]]
177
157
  name = "atoi"
178
158
  version = "2.0.0"
@@ -188,21 +168,6 @@ version = "1.4.0"
188
168
  source = "registry+https://github.com/rust-lang/crates.io-index"
189
169
  checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
190
170
 
191
- [[package]]
192
- name = "backtrace"
193
- version = "0.3.74"
194
- source = "registry+https://github.com/rust-lang/crates.io-index"
195
- checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
196
- dependencies = [
197
- "addr2line",
198
- "cfg-if",
199
- "libc",
200
- "miniz_oxide",
201
- "object",
202
- "rustc-demangle",
203
- "windows-targets",
204
- ]
205
-
206
171
  [[package]]
207
172
  name = "base64"
208
173
  version = "0.22.1"
@@ -344,7 +309,7 @@ version = "0.1.16"
344
309
  source = "registry+https://github.com/rust-lang/crates.io-index"
345
310
  checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
346
311
  dependencies = [
347
- "getrandom",
312
+ "getrandom 0.2.15",
348
313
  "once_cell",
349
314
  "tiny-keccak",
350
315
  ]
@@ -370,17 +335,6 @@ version = "0.2.2"
370
335
  source = "registry+https://github.com/rust-lang/crates.io-index"
371
336
  checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
372
337
 
373
- [[package]]
374
- name = "displaydoc"
375
- version = "0.2.5"
376
- source = "registry+https://github.com/rust-lang/crates.io-index"
377
- checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
378
- dependencies = [
379
- "proc-macro2",
380
- "quote",
381
- "syn",
382
- ]
383
-
384
338
  [[package]]
385
339
  name = "either"
386
340
  version = "1.13.0"
@@ -423,104 +377,6 @@ dependencies = [
423
377
  "miniz_oxide",
424
378
  ]
425
379
 
426
- [[package]]
427
- name = "form_urlencoded"
428
- version = "1.2.1"
429
- source = "registry+https://github.com/rust-lang/crates.io-index"
430
- checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
431
- dependencies = [
432
- "percent-encoding",
433
- ]
434
-
435
- [[package]]
436
- name = "futures"
437
- version = "0.3.31"
438
- source = "registry+https://github.com/rust-lang/crates.io-index"
439
- checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
440
- dependencies = [
441
- "futures-channel",
442
- "futures-core",
443
- "futures-executor",
444
- "futures-io",
445
- "futures-sink",
446
- "futures-task",
447
- "futures-util",
448
- ]
449
-
450
- [[package]]
451
- name = "futures-channel"
452
- version = "0.3.31"
453
- source = "registry+https://github.com/rust-lang/crates.io-index"
454
- checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
455
- dependencies = [
456
- "futures-core",
457
- "futures-sink",
458
- ]
459
-
460
- [[package]]
461
- name = "futures-core"
462
- version = "0.3.31"
463
- source = "registry+https://github.com/rust-lang/crates.io-index"
464
- checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
465
-
466
- [[package]]
467
- name = "futures-executor"
468
- version = "0.3.31"
469
- source = "registry+https://github.com/rust-lang/crates.io-index"
470
- checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
471
- dependencies = [
472
- "futures-core",
473
- "futures-task",
474
- "futures-util",
475
- ]
476
-
477
- [[package]]
478
- name = "futures-io"
479
- version = "0.3.31"
480
- source = "registry+https://github.com/rust-lang/crates.io-index"
481
- checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
482
-
483
- [[package]]
484
- name = "futures-macro"
485
- version = "0.3.31"
486
- source = "registry+https://github.com/rust-lang/crates.io-index"
487
- checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
488
- dependencies = [
489
- "proc-macro2",
490
- "quote",
491
- "syn",
492
- ]
493
-
494
- [[package]]
495
- name = "futures-sink"
496
- version = "0.3.31"
497
- source = "registry+https://github.com/rust-lang/crates.io-index"
498
- checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
499
-
500
- [[package]]
501
- name = "futures-task"
502
- version = "0.3.31"
503
- source = "registry+https://github.com/rust-lang/crates.io-index"
504
- checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
505
-
506
- [[package]]
507
- name = "futures-util"
508
- version = "0.3.31"
509
- source = "registry+https://github.com/rust-lang/crates.io-index"
510
- checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
511
- dependencies = [
512
- "futures-channel",
513
- "futures-core",
514
- "futures-io",
515
- "futures-macro",
516
- "futures-sink",
517
- "futures-task",
518
- "memchr",
519
- "pin-project-lite",
520
- "pin-utils",
521
- "slab",
522
- ]
523
-
524
380
  [[package]]
525
381
  name = "getrandom"
526
382
  version = "0.2.15"
@@ -529,14 +385,20 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
529
385
  dependencies = [
530
386
  "cfg-if",
531
387
  "libc",
532
- "wasi",
388
+ "wasi 0.11.0+wasi-snapshot-preview1",
533
389
  ]
534
390
 
535
391
  [[package]]
536
- name = "gimli"
537
- version = "0.31.1"
392
+ name = "getrandom"
393
+ version = "0.3.1"
538
394
  source = "registry+https://github.com/rust-lang/crates.io-index"
539
- checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
395
+ checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8"
396
+ dependencies = [
397
+ "cfg-if",
398
+ "libc",
399
+ "wasi 0.13.3+wasi-0.2.2",
400
+ "windows-targets",
401
+ ]
540
402
 
541
403
  [[package]]
542
404
  name = "glob"
@@ -561,18 +423,6 @@ version = "0.15.2"
561
423
  source = "registry+https://github.com/rust-lang/crates.io-index"
562
424
  checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
563
425
 
564
- [[package]]
565
- name = "heck"
566
- version = "0.5.0"
567
- source = "registry+https://github.com/rust-lang/crates.io-index"
568
- checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
569
-
570
- [[package]]
571
- name = "humantime"
572
- version = "2.1.0"
573
- source = "registry+https://github.com/rust-lang/crates.io-index"
574
- checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
575
-
576
426
  [[package]]
577
427
  name = "iana-time-zone"
578
428
  version = "0.1.61"
@@ -596,145 +446,6 @@ dependencies = [
596
446
  "cc",
597
447
  ]
598
448
 
599
- [[package]]
600
- name = "icu_collections"
601
- version = "1.5.0"
602
- source = "registry+https://github.com/rust-lang/crates.io-index"
603
- checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
604
- dependencies = [
605
- "displaydoc",
606
- "yoke",
607
- "zerofrom",
608
- "zerovec",
609
- ]
610
-
611
- [[package]]
612
- name = "icu_locid"
613
- version = "1.5.0"
614
- source = "registry+https://github.com/rust-lang/crates.io-index"
615
- checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
616
- dependencies = [
617
- "displaydoc",
618
- "litemap",
619
- "tinystr",
620
- "writeable",
621
- "zerovec",
622
- ]
623
-
624
- [[package]]
625
- name = "icu_locid_transform"
626
- version = "1.5.0"
627
- source = "registry+https://github.com/rust-lang/crates.io-index"
628
- checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
629
- dependencies = [
630
- "displaydoc",
631
- "icu_locid",
632
- "icu_locid_transform_data",
633
- "icu_provider",
634
- "tinystr",
635
- "zerovec",
636
- ]
637
-
638
- [[package]]
639
- name = "icu_locid_transform_data"
640
- version = "1.5.0"
641
- source = "registry+https://github.com/rust-lang/crates.io-index"
642
- checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
643
-
644
- [[package]]
645
- name = "icu_normalizer"
646
- version = "1.5.0"
647
- source = "registry+https://github.com/rust-lang/crates.io-index"
648
- checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
649
- dependencies = [
650
- "displaydoc",
651
- "icu_collections",
652
- "icu_normalizer_data",
653
- "icu_properties",
654
- "icu_provider",
655
- "smallvec",
656
- "utf16_iter",
657
- "utf8_iter",
658
- "write16",
659
- "zerovec",
660
- ]
661
-
662
- [[package]]
663
- name = "icu_normalizer_data"
664
- version = "1.5.0"
665
- source = "registry+https://github.com/rust-lang/crates.io-index"
666
- checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
667
-
668
- [[package]]
669
- name = "icu_properties"
670
- version = "1.5.1"
671
- source = "registry+https://github.com/rust-lang/crates.io-index"
672
- checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
673
- dependencies = [
674
- "displaydoc",
675
- "icu_collections",
676
- "icu_locid_transform",
677
- "icu_properties_data",
678
- "icu_provider",
679
- "tinystr",
680
- "zerovec",
681
- ]
682
-
683
- [[package]]
684
- name = "icu_properties_data"
685
- version = "1.5.0"
686
- source = "registry+https://github.com/rust-lang/crates.io-index"
687
- checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
688
-
689
- [[package]]
690
- name = "icu_provider"
691
- version = "1.5.0"
692
- source = "registry+https://github.com/rust-lang/crates.io-index"
693
- checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
694
- dependencies = [
695
- "displaydoc",
696
- "icu_locid",
697
- "icu_provider_macros",
698
- "stable_deref_trait",
699
- "tinystr",
700
- "writeable",
701
- "yoke",
702
- "zerofrom",
703
- "zerovec",
704
- ]
705
-
706
- [[package]]
707
- name = "icu_provider_macros"
708
- version = "1.5.0"
709
- source = "registry+https://github.com/rust-lang/crates.io-index"
710
- checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
711
- dependencies = [
712
- "proc-macro2",
713
- "quote",
714
- "syn",
715
- ]
716
-
717
- [[package]]
718
- name = "idna"
719
- version = "1.0.3"
720
- source = "registry+https://github.com/rust-lang/crates.io-index"
721
- checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
722
- dependencies = [
723
- "idna_adapter",
724
- "smallvec",
725
- "utf8_iter",
726
- ]
727
-
728
- [[package]]
729
- name = "idna_adapter"
730
- version = "1.2.0"
731
- source = "registry+https://github.com/rust-lang/crates.io-index"
732
- checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
733
- dependencies = [
734
- "icu_normalizer",
735
- "icu_properties",
736
- ]
737
-
738
449
  [[package]]
739
450
  name = "integer-encoding"
740
451
  version = "3.0.4"
@@ -750,15 +461,6 @@ dependencies = [
750
461
  "either",
751
462
  ]
752
463
 
753
- [[package]]
754
- name = "itertools"
755
- version = "0.13.0"
756
- source = "registry+https://github.com/rust-lang/crates.io-index"
757
- checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
758
- dependencies = [
759
- "either",
760
- ]
761
-
762
464
  [[package]]
763
465
  name = "itertools"
764
466
  version = "0.14.0"
@@ -956,22 +658,6 @@ version = "0.4.15"
956
658
  source = "registry+https://github.com/rust-lang/crates.io-index"
957
659
  checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
958
660
 
959
- [[package]]
960
- name = "litemap"
961
- version = "0.7.4"
962
- source = "registry+https://github.com/rust-lang/crates.io-index"
963
- checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
964
-
965
- [[package]]
966
- name = "lock_api"
967
- version = "0.4.12"
968
- source = "registry+https://github.com/rust-lang/crates.io-index"
969
- checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
970
- dependencies = [
971
- "autocfg",
972
- "scopeguard",
973
- ]
974
-
975
661
  [[package]]
976
662
  name = "log"
977
663
  version = "0.4.22"
@@ -1124,36 +810,6 @@ dependencies = [
1124
810
  "libm",
1125
811
  ]
1126
812
 
1127
- [[package]]
1128
- name = "object"
1129
- version = "0.36.7"
1130
- source = "registry+https://github.com/rust-lang/crates.io-index"
1131
- checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
1132
- dependencies = [
1133
- "memchr",
1134
- ]
1135
-
1136
- [[package]]
1137
- name = "object_store"
1138
- version = "0.11.2"
1139
- source = "registry+https://github.com/rust-lang/crates.io-index"
1140
- checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf"
1141
- dependencies = [
1142
- "async-trait",
1143
- "bytes",
1144
- "chrono",
1145
- "futures",
1146
- "humantime",
1147
- "itertools 0.13.0",
1148
- "parking_lot",
1149
- "percent-encoding",
1150
- "snafu",
1151
- "tokio",
1152
- "tracing",
1153
- "url",
1154
- "walkdir",
1155
- ]
1156
-
1157
813
  [[package]]
1158
814
  name = "once_cell"
1159
815
  version = "1.20.2"
@@ -1169,29 +825,6 @@ dependencies = [
1169
825
  "num-traits",
1170
826
  ]
1171
827
 
1172
- [[package]]
1173
- name = "parking_lot"
1174
- version = "0.12.3"
1175
- source = "registry+https://github.com/rust-lang/crates.io-index"
1176
- checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
1177
- dependencies = [
1178
- "lock_api",
1179
- "parking_lot_core",
1180
- ]
1181
-
1182
- [[package]]
1183
- name = "parking_lot_core"
1184
- version = "0.9.10"
1185
- source = "registry+https://github.com/rust-lang/crates.io-index"
1186
- checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
1187
- dependencies = [
1188
- "cfg-if",
1189
- "libc",
1190
- "redox_syscall",
1191
- "smallvec",
1192
- "windows-targets",
1193
- ]
1194
-
1195
828
  [[package]]
1196
829
  name = "parquet"
1197
830
  version = "0.1.0"
@@ -1206,6 +839,7 @@ dependencies = [
1206
839
  "magnus",
1207
840
  "mimalloc",
1208
841
  "parquet 54.0.0",
842
+ "rand",
1209
843
  "rb-sys",
1210
844
  "tempfile",
1211
845
  "thiserror",
@@ -1230,19 +864,16 @@ dependencies = [
1230
864
  "bytes",
1231
865
  "chrono",
1232
866
  "flate2",
1233
- "futures",
1234
867
  "half",
1235
868
  "hashbrown",
1236
869
  "lz4_flex",
1237
870
  "num",
1238
871
  "num-bigint",
1239
- "object_store",
1240
872
  "paste",
1241
873
  "seq-macro",
1242
874
  "serde_json",
1243
875
  "snap",
1244
876
  "thrift",
1245
- "tokio",
1246
877
  "twox-hash",
1247
878
  "zstd",
1248
879
  "zstd-sys",
@@ -1254,24 +885,6 @@ version = "1.0.15"
1254
885
  source = "registry+https://github.com/rust-lang/crates.io-index"
1255
886
  checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
1256
887
 
1257
- [[package]]
1258
- name = "percent-encoding"
1259
- version = "2.3.1"
1260
- source = "registry+https://github.com/rust-lang/crates.io-index"
1261
- checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
1262
-
1263
- [[package]]
1264
- name = "pin-project-lite"
1265
- version = "0.2.15"
1266
- source = "registry+https://github.com/rust-lang/crates.io-index"
1267
- checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff"
1268
-
1269
- [[package]]
1270
- name = "pin-utils"
1271
- version = "0.1.0"
1272
- source = "registry+https://github.com/rust-lang/crates.io-index"
1273
- checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
1274
-
1275
888
  [[package]]
1276
889
  name = "pkg-config"
1277
890
  version = "0.3.31"
@@ -1293,6 +906,15 @@ dependencies = [
1293
906
  "portable-atomic",
1294
907
  ]
1295
908
 
909
+ [[package]]
910
+ name = "ppv-lite86"
911
+ version = "0.2.20"
912
+ source = "registry+https://github.com/rust-lang/crates.io-index"
913
+ checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
914
+ dependencies = [
915
+ "zerocopy 0.7.35",
916
+ ]
917
+
1296
918
  [[package]]
1297
919
  name = "proc-macro2"
1298
920
  version = "1.0.92"
@@ -1311,6 +933,37 @@ dependencies = [
1311
933
  "proc-macro2",
1312
934
  ]
1313
935
 
936
+ [[package]]
937
+ name = "rand"
938
+ version = "0.9.0"
939
+ source = "registry+https://github.com/rust-lang/crates.io-index"
940
+ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
941
+ dependencies = [
942
+ "rand_chacha",
943
+ "rand_core",
944
+ "zerocopy 0.8.14",
945
+ ]
946
+
947
+ [[package]]
948
+ name = "rand_chacha"
949
+ version = "0.9.0"
950
+ source = "registry+https://github.com/rust-lang/crates.io-index"
951
+ checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
952
+ dependencies = [
953
+ "ppv-lite86",
954
+ "rand_core",
955
+ ]
956
+
957
+ [[package]]
958
+ name = "rand_core"
959
+ version = "0.9.0"
960
+ source = "registry+https://github.com/rust-lang/crates.io-index"
961
+ checksum = "b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff"
962
+ dependencies = [
963
+ "getrandom 0.3.1",
964
+ "zerocopy 0.8.14",
965
+ ]
966
+
1314
967
  [[package]]
1315
968
  name = "rb-sys"
1316
969
  version = "0.9.104"
@@ -1341,15 +994,6 @@ version = "0.1.2"
1341
994
  source = "registry+https://github.com/rust-lang/crates.io-index"
1342
995
  checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
1343
996
 
1344
- [[package]]
1345
- name = "redox_syscall"
1346
- version = "0.5.8"
1347
- source = "registry+https://github.com/rust-lang/crates.io-index"
1348
- checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
1349
- dependencies = [
1350
- "bitflags 2.6.0",
1351
- ]
1352
-
1353
997
  [[package]]
1354
998
  name = "regex"
1355
999
  version = "1.11.1"
@@ -1379,12 +1023,6 @@ version = "0.8.5"
1379
1023
  source = "registry+https://github.com/rust-lang/crates.io-index"
1380
1024
  checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
1381
1025
 
1382
- [[package]]
1383
- name = "rustc-demangle"
1384
- version = "0.1.24"
1385
- source = "registry+https://github.com/rust-lang/crates.io-index"
1386
- checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
1387
-
1388
1026
  [[package]]
1389
1027
  name = "rustc-hash"
1390
1028
  version = "1.1.0"
@@ -1419,21 +1057,6 @@ version = "1.0.18"
1419
1057
  source = "registry+https://github.com/rust-lang/crates.io-index"
1420
1058
  checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
1421
1059
 
1422
- [[package]]
1423
- name = "same-file"
1424
- version = "1.0.6"
1425
- source = "registry+https://github.com/rust-lang/crates.io-index"
1426
- checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
1427
- dependencies = [
1428
- "winapi-util",
1429
- ]
1430
-
1431
- [[package]]
1432
- name = "scopeguard"
1433
- version = "1.2.0"
1434
- source = "registry+https://github.com/rust-lang/crates.io-index"
1435
- checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
1436
-
1437
1060
  [[package]]
1438
1061
  name = "semver"
1439
1062
  version = "1.0.24"
@@ -1490,54 +1113,12 @@ version = "1.3.0"
1490
1113
  source = "registry+https://github.com/rust-lang/crates.io-index"
1491
1114
  checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
1492
1115
 
1493
- [[package]]
1494
- name = "slab"
1495
- version = "0.4.9"
1496
- source = "registry+https://github.com/rust-lang/crates.io-index"
1497
- checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
1498
- dependencies = [
1499
- "autocfg",
1500
- ]
1501
-
1502
- [[package]]
1503
- name = "smallvec"
1504
- version = "1.13.2"
1505
- source = "registry+https://github.com/rust-lang/crates.io-index"
1506
- checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
1507
-
1508
- [[package]]
1509
- name = "snafu"
1510
- version = "0.8.5"
1511
- source = "registry+https://github.com/rust-lang/crates.io-index"
1512
- checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019"
1513
- dependencies = [
1514
- "snafu-derive",
1515
- ]
1516
-
1517
- [[package]]
1518
- name = "snafu-derive"
1519
- version = "0.8.5"
1520
- source = "registry+https://github.com/rust-lang/crates.io-index"
1521
- checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917"
1522
- dependencies = [
1523
- "heck",
1524
- "proc-macro2",
1525
- "quote",
1526
- "syn",
1527
- ]
1528
-
1529
1116
  [[package]]
1530
1117
  name = "snap"
1531
1118
  version = "1.1.1"
1532
1119
  source = "registry+https://github.com/rust-lang/crates.io-index"
1533
1120
  checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
1534
1121
 
1535
- [[package]]
1536
- name = "stable_deref_trait"
1537
- version = "1.2.0"
1538
- source = "registry+https://github.com/rust-lang/crates.io-index"
1539
- checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
1540
-
1541
1122
  [[package]]
1542
1123
  name = "static_assertions"
1543
1124
  version = "1.1.0"
@@ -1555,17 +1136,6 @@ dependencies = [
1555
1136
  "unicode-ident",
1556
1137
  ]
1557
1138
 
1558
- [[package]]
1559
- name = "synstructure"
1560
- version = "0.13.1"
1561
- source = "registry+https://github.com/rust-lang/crates.io-index"
1562
- checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
1563
- dependencies = [
1564
- "proc-macro2",
1565
- "quote",
1566
- "syn",
1567
- ]
1568
-
1569
1139
  [[package]]
1570
1140
  name = "tempfile"
1571
1141
  version = "3.15.0"
@@ -1574,7 +1144,7 @@ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
1574
1144
  dependencies = [
1575
1145
  "cfg-if",
1576
1146
  "fastrand",
1577
- "getrandom",
1147
+ "getrandom 0.2.15",
1578
1148
  "once_cell",
1579
1149
  "rustix",
1580
1150
  "windows-sys",
@@ -1620,70 +1190,6 @@ dependencies = [
1620
1190
  "crunchy",
1621
1191
  ]
1622
1192
 
1623
- [[package]]
1624
- name = "tinystr"
1625
- version = "0.7.6"
1626
- source = "registry+https://github.com/rust-lang/crates.io-index"
1627
- checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
1628
- dependencies = [
1629
- "displaydoc",
1630
- "zerovec",
1631
- ]
1632
-
1633
- [[package]]
1634
- name = "tokio"
1635
- version = "1.42.0"
1636
- source = "registry+https://github.com/rust-lang/crates.io-index"
1637
- checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551"
1638
- dependencies = [
1639
- "backtrace",
1640
- "bytes",
1641
- "pin-project-lite",
1642
- "tokio-macros",
1643
- ]
1644
-
1645
- [[package]]
1646
- name = "tokio-macros"
1647
- version = "2.4.0"
1648
- source = "registry+https://github.com/rust-lang/crates.io-index"
1649
- checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
1650
- dependencies = [
1651
- "proc-macro2",
1652
- "quote",
1653
- "syn",
1654
- ]
1655
-
1656
- [[package]]
1657
- name = "tracing"
1658
- version = "0.1.41"
1659
- source = "registry+https://github.com/rust-lang/crates.io-index"
1660
- checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
1661
- dependencies = [
1662
- "pin-project-lite",
1663
- "tracing-attributes",
1664
- "tracing-core",
1665
- ]
1666
-
1667
- [[package]]
1668
- name = "tracing-attributes"
1669
- version = "0.1.28"
1670
- source = "registry+https://github.com/rust-lang/crates.io-index"
1671
- checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
1672
- dependencies = [
1673
- "proc-macro2",
1674
- "quote",
1675
- "syn",
1676
- ]
1677
-
1678
- [[package]]
1679
- name = "tracing-core"
1680
- version = "0.1.33"
1681
- source = "registry+https://github.com/rust-lang/crates.io-index"
1682
- checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
1683
- dependencies = [
1684
- "once_cell",
1685
- ]
1686
-
1687
1193
  [[package]]
1688
1194
  name = "twox-hash"
1689
1195
  version = "1.6.3"
@@ -1700,29 +1206,6 @@ version = "1.0.14"
1700
1206
  source = "registry+https://github.com/rust-lang/crates.io-index"
1701
1207
  checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
1702
1208
 
1703
- [[package]]
1704
- name = "url"
1705
- version = "2.5.4"
1706
- source = "registry+https://github.com/rust-lang/crates.io-index"
1707
- checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
1708
- dependencies = [
1709
- "form_urlencoded",
1710
- "idna",
1711
- "percent-encoding",
1712
- ]
1713
-
1714
- [[package]]
1715
- name = "utf16_iter"
1716
- version = "1.0.5"
1717
- source = "registry+https://github.com/rust-lang/crates.io-index"
1718
- checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
1719
-
1720
- [[package]]
1721
- name = "utf8_iter"
1722
- version = "1.0.4"
1723
- source = "registry+https://github.com/rust-lang/crates.io-index"
1724
- checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
1725
-
1726
1209
  [[package]]
1727
1210
  name = "version_check"
1728
1211
  version = "0.9.5"
@@ -1730,20 +1213,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1730
1213
  checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
1731
1214
 
1732
1215
  [[package]]
1733
- name = "walkdir"
1734
- version = "2.5.0"
1216
+ name = "wasi"
1217
+ version = "0.11.0+wasi-snapshot-preview1"
1735
1218
  source = "registry+https://github.com/rust-lang/crates.io-index"
1736
- checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
1737
- dependencies = [
1738
- "same-file",
1739
- "winapi-util",
1740
- ]
1219
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
1741
1220
 
1742
1221
  [[package]]
1743
1222
  name = "wasi"
1744
- version = "0.11.0+wasi-snapshot-preview1"
1223
+ version = "0.13.3+wasi-0.2.2"
1745
1224
  source = "registry+https://github.com/rust-lang/crates.io-index"
1746
- checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
1225
+ checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2"
1226
+ dependencies = [
1227
+ "wit-bindgen-rt",
1228
+ ]
1747
1229
 
1748
1230
  [[package]]
1749
1231
  name = "wasm-bindgen"
@@ -1799,15 +1281,6 @@ version = "0.2.99"
1799
1281
  source = "registry+https://github.com/rust-lang/crates.io-index"
1800
1282
  checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
1801
1283
 
1802
- [[package]]
1803
- name = "winapi-util"
1804
- version = "0.1.9"
1805
- source = "registry+https://github.com/rust-lang/crates.io-index"
1806
- checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
1807
- dependencies = [
1808
- "windows-sys",
1809
- ]
1810
-
1811
1284
  [[package]]
1812
1285
  name = "windows-core"
1813
1286
  version = "0.52.0"
@@ -1891,48 +1364,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1891
1364
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
1892
1365
 
1893
1366
  [[package]]
1894
- name = "write16"
1895
- version = "1.0.0"
1367
+ name = "wit-bindgen-rt"
1368
+ version = "0.33.0"
1896
1369
  source = "registry+https://github.com/rust-lang/crates.io-index"
1897
- checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
1898
-
1899
- [[package]]
1900
- name = "writeable"
1901
- version = "0.5.5"
1902
- source = "registry+https://github.com/rust-lang/crates.io-index"
1903
- checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1904
-
1905
- [[package]]
1906
- name = "yoke"
1907
- version = "0.7.5"
1908
- source = "registry+https://github.com/rust-lang/crates.io-index"
1909
- checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
1370
+ checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
1910
1371
  dependencies = [
1911
- "serde",
1912
- "stable_deref_trait",
1913
- "yoke-derive",
1914
- "zerofrom",
1372
+ "bitflags 2.6.0",
1915
1373
  ]
1916
1374
 
1917
1375
  [[package]]
1918
- name = "yoke-derive"
1919
- version = "0.7.5"
1376
+ name = "zerocopy"
1377
+ version = "0.7.35"
1920
1378
  source = "registry+https://github.com/rust-lang/crates.io-index"
1921
- checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
1379
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
1922
1380
  dependencies = [
1923
- "proc-macro2",
1924
- "quote",
1925
- "syn",
1926
- "synstructure",
1381
+ "byteorder",
1382
+ "zerocopy-derive 0.7.35",
1927
1383
  ]
1928
1384
 
1929
1385
  [[package]]
1930
1386
  name = "zerocopy"
1931
- version = "0.7.35"
1387
+ version = "0.8.14"
1932
1388
  source = "registry+https://github.com/rust-lang/crates.io-index"
1933
- checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
1389
+ checksum = "a367f292d93d4eab890745e75a778da40909cab4d6ff8173693812f79c4a2468"
1934
1390
  dependencies = [
1935
- "zerocopy-derive",
1391
+ "zerocopy-derive 0.8.14",
1936
1392
  ]
1937
1393
 
1938
1394
  [[package]]
@@ -1947,42 +1403,10 @@ dependencies = [
1947
1403
  ]
1948
1404
 
1949
1405
  [[package]]
1950
- name = "zerofrom"
1951
- version = "0.1.5"
1952
- source = "registry+https://github.com/rust-lang/crates.io-index"
1953
- checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
1954
- dependencies = [
1955
- "zerofrom-derive",
1956
- ]
1957
-
1958
- [[package]]
1959
- name = "zerofrom-derive"
1960
- version = "0.1.5"
1961
- source = "registry+https://github.com/rust-lang/crates.io-index"
1962
- checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
1963
- dependencies = [
1964
- "proc-macro2",
1965
- "quote",
1966
- "syn",
1967
- "synstructure",
1968
- ]
1969
-
1970
- [[package]]
1971
- name = "zerovec"
1972
- version = "0.10.4"
1973
- source = "registry+https://github.com/rust-lang/crates.io-index"
1974
- checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
1975
- dependencies = [
1976
- "yoke",
1977
- "zerofrom",
1978
- "zerovec-derive",
1979
- ]
1980
-
1981
- [[package]]
1982
- name = "zerovec-derive"
1983
- version = "0.10.3"
1406
+ name = "zerocopy-derive"
1407
+ version = "0.8.14"
1984
1408
  source = "registry+https://github.com/rust-lang/crates.io-index"
1985
- checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
1409
+ checksum = "d3931cb58c62c13adec22e38686b559c86a30565e16ad6e8510a337cedc611e1"
1986
1410
  dependencies = [
1987
1411
  "proc-macro2",
1988
1412
  "quote",
data/README.md CHANGED
@@ -117,6 +117,20 @@ Parquet.write_rows(rows,
117
117
  write_to: "data.parquet",
118
118
  batch_size: 500
119
119
  )
120
+
121
+ # Optionally specify memory threshold for flushing (default is 64MB)
122
+ Parquet.write_rows(rows,
123
+ schema: schema,
124
+ write_to: "data.parquet",
125
+ flush_threshold: 32 * 1024 * 1024 # 32MB
126
+ )
127
+
128
+ # Optionally specify sample size for row size estimation (default is 100)
129
+ Parquet.write_rows(rows,
130
+ schema: schema,
131
+ write_to: "data.parquet",
132
+ sample_size: 200 # Sample 200 rows for size estimation
133
+ )
120
134
  ```
121
135
 
122
136
  ### Writing Column-wise Data
@@ -155,11 +169,12 @@ columns = batches.each
155
169
  # Write to a parquet file with default ZSTD compression
156
170
  Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
157
171
 
158
- # Write to a parquet file with specific compression
172
+ # Write to a parquet file with specific compression and memory threshold
159
173
  Parquet.write_columns(columns,
160
174
  schema: schema,
161
175
  write_to: "data.parquet",
162
- compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
176
+ compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
177
+ flush_threshold: 32 * 1024 * 1024 # 32MB
163
178
  )
164
179
 
165
180
  # Write to an IO object
@@ -14,7 +14,8 @@ bytes = "^1.9"
14
14
  itertools = "^0.14"
15
15
  jiff = "0.1.19"
16
16
  magnus = { version = "0.7", features = ["rb-sys"] }
17
- parquet = { version = "^54.0", features = ["json", "object_store"] }
17
+ parquet = { version = "^54.0", features = ["json"] }
18
+ rand = "0.9"
18
19
  rb-sys = "^0.9"
19
20
  thiserror = "2.0"
20
21
  tempfile = "^3.15"
@@ -24,7 +24,9 @@ pub struct ParquetWriteArgs<'a> {
24
24
  pub write_to: Value,
25
25
  pub schema: Vec<SchemaField<'a>>,
26
26
  pub batch_size: Option<usize>,
27
+ pub flush_threshold: Option<usize>,
27
28
  pub compression: Option<String>,
29
+ pub sample_size: Option<usize>,
28
30
  }
29
31
 
30
32
  pub trait SendableWrite: Send + Write {}
@@ -1,6 +1,7 @@
1
1
  use std::{
2
2
  fs::File,
3
3
  io::{self, BufReader, BufWriter},
4
+ mem,
4
5
  sync::Arc,
5
6
  };
6
7
 
@@ -16,6 +17,7 @@ use parquet::{
16
17
  basic::{Compression, GzipLevel, ZstdLevel},
17
18
  file::properties::WriterProperties,
18
19
  };
20
+ use rand::Rng;
19
21
  use tempfile::NamedTempFile;
20
22
 
21
23
  use crate::{
@@ -24,7 +26,12 @@ use crate::{
24
26
  IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
25
27
  };
26
28
 
27
- const DEFAULT_BATCH_SIZE: usize = 1000;
29
+ const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
30
+ const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
31
+ const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
32
+
33
+ // Maximum memory usage per batch (64MB by default)
34
+ const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
28
35
 
29
36
  /// Parse arguments for Parquet writing
30
37
  pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
@@ -32,12 +39,26 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
32
39
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
33
40
  let (read_from,) = parsed_args.required;
34
41
 
35
- let kwargs =
36
- get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
37
- parsed_args.keywords,
38
- &["schema", "write_to"],
39
- &["batch_size", "compression"],
40
- )?;
42
+ let kwargs = get_kwargs::<
43
+ _,
44
+ (Value, Value),
45
+ (
46
+ Option<Option<usize>>,
47
+ Option<Option<usize>>,
48
+ Option<Option<String>>,
49
+ Option<Option<usize>>,
50
+ ),
51
+ (),
52
+ >(
53
+ parsed_args.keywords,
54
+ &["schema", "write_to"],
55
+ &[
56
+ "batch_size",
57
+ "flush_threshold",
58
+ "compression",
59
+ "sample_size",
60
+ ],
61
+ )?;
41
62
 
42
63
  let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
43
64
  MagnusError::new(
@@ -110,10 +131,59 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
110
131
  write_to: kwargs.required.1,
111
132
  schema,
112
133
  batch_size: kwargs.optional.0.flatten(),
113
- compression: kwargs.optional.1.flatten(),
134
+ flush_threshold: kwargs.optional.1.flatten(),
135
+ compression: kwargs.optional.2.flatten(),
136
+ sample_size: kwargs.optional.3.flatten(),
114
137
  })
115
138
  }
116
139
 
140
+ /// Estimate the size in bytes of one row by summing a per-column estimate chosen from each field's schema type; always returns `Ok` in the code shown here.
141
+ fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
142
+ let mut row_size = 0;
143
+ for (field, value) in schema.iter().zip(row.into_iter()) {
144
+ // Pair each schema field with the row value at the same position; `zip` stops at the shorter of the two
145
+ row_size += match &field.type_ {
146
+ // Match on a reference so the field's type is not moved; fixed-width types count their byte width
147
+ ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
148
+ ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
149
+ ParquetSchemaType::Int32
150
+ | ParquetSchemaType::UInt32
151
+ | ParquetSchemaType::Float
152
+ | ParquetSchemaType::Date32 => 4,
153
+ ParquetSchemaType::Int64
154
+ | ParquetSchemaType::UInt64
155
+ | ParquetSchemaType::Double
156
+ | ParquetSchemaType::TimestampMillis
157
+ | ParquetSchemaType::TimestampMicros => 8,
158
+ ParquetSchemaType::String => {
159
+ if let Ok(s) = String::try_convert(value) {
160
+ s.len() + mem::size_of::<usize>() // byte length of the converted string plus one usize for a length prefix
161
+ } else {
162
+ 16 // fallback estimate when the value does not convert to a String
163
+ }
164
+ }
165
+ ParquetSchemaType::Binary => {
166
+ if let Ok(bytes) = Vec::<u8>::try_convert(value) {
167
+ bytes.len() + mem::size_of::<usize>() // byte count plus one usize for a length prefix
168
+ } else {
169
+ 16 // fallback estimate; NOTE(review): confirm Vec<u8> conversion accepts the Ruby values used for binary columns, otherwise this branch is always taken
170
+ }
171
+ }
172
+ ParquetSchemaType::Boolean => 1,
173
+ ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
174
+ 32 // flat guess for nested types; the actual contents are not inspected
175
+ }
176
+ };
177
+ }
178
+ Ok(row_size)
179
+ }
180
+
181
+ /// Calculate how many rows of `row_size` bytes fit in `memory_threshold` bytes, never returning fewer than `MIN_BATCH_SIZE`; a `row_size` of 0 is treated as 1.
182
+ fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
183
+ let batch_size = memory_threshold / row_size.max(1); // clamp the divisor: an estimate of 0 (e.g. no columns) would otherwise panic with a divide-by-zero
184
+ batch_size.max(MIN_BATCH_SIZE)
185
+ }
186
+
117
187
  #[inline]
118
188
  pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
119
189
  let ruby = unsafe { Ruby::get_unchecked() };
@@ -122,11 +192,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
122
192
  read_from,
123
193
  write_to,
124
194
  schema,
125
- batch_size,
195
+ batch_size: user_batch_size,
126
196
  compression,
197
+ flush_threshold,
198
+ sample_size: user_sample_size,
127
199
  } = parse_parquet_write_args(args)?;
128
200
 
129
- let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
201
+ let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
130
202
 
131
203
  // Convert schema to Arrow schema
132
204
  let arrow_fields: Vec<Field> = schema
@@ -170,11 +242,20 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
170
242
  if read_from.is_kind_of(ruby.class_enumerator()) {
171
243
  // Create collectors for each column
172
244
  let mut column_collectors: Vec<ColumnCollector> = schema
173
- .into_iter()
174
- .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
245
+ .iter()
246
+ .map(|field| {
247
+ // Clone the type to avoid moving from a reference
248
+ let type_clone = field.type_.clone();
249
+ ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
250
+ })
175
251
  .collect();
176
252
 
177
253
  let mut rows_in_batch = 0;
254
+ let mut total_rows = 0;
255
+ let mut rng = rand::rng();
256
+ let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
257
+ let mut size_samples = Vec::with_capacity(sample_size);
258
+ let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
178
259
 
179
260
  loop {
180
261
  match read_from.funcall::<_, _, Value>("next", ()) {
@@ -196,16 +277,31 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
196
277
  ));
197
278
  }
198
279
 
199
- // Process each value in the row immediately
280
+ // Sample row sizes using reservoir sampling
281
+ if size_samples.len() < sample_size {
282
+ size_samples.push(estimate_single_row_size(&row_array, &schema)?);
283
+ } else if rng.random_range(0..=total_rows) < sample_size {
284
+ let idx = rng.random_range(0..sample_size);
285
+ size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
286
+ }
287
+
288
+ // Process each value in the row
200
289
  for (collector, value) in column_collectors.iter_mut().zip(row_array) {
201
290
  collector.push_value(value)?;
202
291
  }
203
292
 
204
293
  rows_in_batch += 1;
294
+ total_rows += 1;
295
+
296
+ // Recalculate batch size if we have enough samples and no user-specified size
297
+ if size_samples.len() >= sample_size && user_batch_size.is_none() {
298
+ let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
299
+ current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
300
+ }
205
301
 
206
302
  // When we reach batch size, write the batch
207
- if rows_in_batch >= batch_size {
208
- write_batch(&mut writer, &mut column_collectors)?;
303
+ if rows_in_batch >= current_batch_size {
304
+ write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
209
305
  rows_in_batch = 0;
210
306
  }
211
307
  }
@@ -213,7 +309,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
213
309
  if e.is_kind_of(ruby.exception_stop_iteration()) {
214
310
  // Write any remaining rows
215
311
  if rows_in_batch > 0 {
216
- write_batch(&mut writer, &mut column_collectors)?;
312
+ write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
217
313
  }
218
314
  break;
219
315
  }
@@ -247,8 +343,12 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
247
343
  schema,
248
344
  batch_size: _,
249
345
  compression,
346
+ flush_threshold,
347
+ sample_size: _,
250
348
  } = parse_parquet_write_args(args)?;
251
349
 
350
+ let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
351
+
252
352
  // Convert schema to Arrow schema
253
353
  let arrow_fields: Vec<Field> = schema
254
354
  .iter()
@@ -339,6 +439,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
339
439
  writer
340
440
  .write(&record_batch)
341
441
  .map_err(|e| ParquetErrorWrapper(e))?;
442
+
443
+ match &mut writer {
444
+ WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
445
+ if w.in_progress_size() >= flush_threshold {
446
+ w.flush().map_err(|e| ParquetErrorWrapper(e))?;
447
+ }
448
+ }
449
+ }
342
450
  }
343
451
  Err(e) => {
344
452
  if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -435,6 +543,7 @@ fn copy_temp_file_to_io_like(
435
543
  fn write_batch(
436
544
  writer: &mut WriterOutput,
437
545
  collectors: &mut [ColumnCollector],
546
+ flush_threshold: usize,
438
547
  ) -> Result<(), MagnusError> {
439
548
  // Convert columns to Arrow arrays
440
549
  let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -454,5 +563,13 @@ fn write_batch(
454
563
  .write(&record_batch)
455
564
  .map_err(|e| ParquetErrorWrapper(e))?;
456
565
 
566
+ match writer {
567
+ WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
568
+ if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
569
+ w.flush().map_err(|e| ParquetErrorWrapper(e))?;
570
+ }
571
+ }
572
+ }
573
+
457
574
  Ok(())
458
575
  }
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.8"
2
+ VERSION = "0.2.10"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -65,18 +65,22 @@ module Parquet
65
65
  # - `timestamp_millis`, `timestamp_micros`
66
66
  # - `write_to`: String path or IO object to write the parquet file to
67
67
  # - `batch_size`: Optional batch size for writing (when omitted, derived from `flush_threshold` and sampled row sizes)
68
+ # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
68
69
  # - `compression`: Optional compression type to use (defaults to "zstd")
69
70
  # Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
71
+ # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
70
72
  sig do
71
73
  params(
72
74
  read_from: T::Enumerator[T::Array[T.untyped]],
73
75
  schema: T::Array[T::Hash[String, String]],
74
76
  write_to: T.any(String, IO),
75
77
  batch_size: T.nilable(Integer),
76
- compression: T.nilable(String)
78
+ flush_threshold: T.nilable(Integer),
79
+ compression: T.nilable(String),
80
+ sample_size: T.nilable(Integer)
77
81
  ).void
78
82
  end
79
- def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
83
+ def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil, sample_size: nil)
80
84
  end
81
85
 
82
86
  # Options:
@@ -92,6 +96,7 @@ module Parquet
92
96
  # - `timestamp_millis`, `timestamp_micros`
93
97
  # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
94
98
  # - `write_to`: String path or IO object to write the parquet file to
99
+ # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
95
100
  # - `compression`: Optional compression type to use (defaults to "zstd")
96
101
  # Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
97
102
  sig do
@@ -99,9 +104,10 @@ module Parquet
99
104
  read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
100
105
  schema: T::Array[T::Hash[String, String]],
101
106
  write_to: T.any(String, IO),
107
+ flush_threshold: T.nilable(Integer),
102
108
  compression: T.nilable(String)
103
109
  ).void
104
110
  end
105
- def self.write_columns(read_from, schema:, write_to:, compression: nil)
111
+ def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
106
112
  end
107
113
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.2.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-13 00:00:00.000000000 Z
11
+ date: 2025-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys