parquet 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
4
- data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
3
+ metadata.gz: d7ad6471a6c32833344fa6196794733c398a8de814652cc85f1121ab85f14f86
4
+ data.tar.gz: edeb31c6b6683b45913e58782753678302c980165aab4764bbc5b498c1203798
5
5
  SHA512:
6
- metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
7
- data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
6
+ metadata.gz: 13929dda3279394f9a8b2c25a3c605fd813393c44030b2c5fc52c815e0582b75c008bcb4146b08d6079ab3d19e8d545ffbe00b5a10d6757556d4f27122039927
7
+ data.tar.gz: a542706c99bf184b0833239950d7a269dad091dece68e09dc495528df14f90ae614fc76017a8af64231e081c247a36def095b31deaa454eac9874c60805c47ca
data/Cargo.lock CHANGED
@@ -2,15 +2,6 @@
2
2
  # It is not intended for manual editing.
3
3
  version = 4
4
4
 
5
- [[package]]
6
- name = "addr2line"
7
- version = "0.24.2"
8
- source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
10
- dependencies = [
11
- "gimli",
12
- ]
13
-
14
5
  [[package]]
15
6
  name = "adler2"
16
7
  version = "2.0.0"
@@ -25,10 +16,10 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
25
16
  dependencies = [
26
17
  "cfg-if",
27
18
  "const-random",
28
- "getrandom",
19
+ "getrandom 0.2.15",
29
20
  "once_cell",
30
21
  "version_check",
31
- "zerocopy",
22
+ "zerocopy 0.7.35",
32
23
  ]
33
24
 
34
25
  [[package]]
@@ -162,17 +153,6 @@ dependencies = [
162
153
  "num",
163
154
  ]
164
155
 
165
- [[package]]
166
- name = "async-trait"
167
- version = "0.1.83"
168
- source = "registry+https://github.com/rust-lang/crates.io-index"
169
- checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd"
170
- dependencies = [
171
- "proc-macro2",
172
- "quote",
173
- "syn",
174
- ]
175
-
176
156
  [[package]]
177
157
  name = "atoi"
178
158
  version = "2.0.0"
@@ -188,21 +168,6 @@ version = "1.4.0"
188
168
  source = "registry+https://github.com/rust-lang/crates.io-index"
189
169
  checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
190
170
 
191
- [[package]]
192
- name = "backtrace"
193
- version = "0.3.74"
194
- source = "registry+https://github.com/rust-lang/crates.io-index"
195
- checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
196
- dependencies = [
197
- "addr2line",
198
- "cfg-if",
199
- "libc",
200
- "miniz_oxide",
201
- "object",
202
- "rustc-demangle",
203
- "windows-targets",
204
- ]
205
-
206
171
  [[package]]
207
172
  name = "base64"
208
173
  version = "0.22.1"
@@ -344,7 +309,7 @@ version = "0.1.16"
344
309
  source = "registry+https://github.com/rust-lang/crates.io-index"
345
310
  checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
346
311
  dependencies = [
347
- "getrandom",
312
+ "getrandom 0.2.15",
348
313
  "once_cell",
349
314
  "tiny-keccak",
350
315
  ]
@@ -370,17 +335,6 @@ version = "0.2.2"
370
335
  source = "registry+https://github.com/rust-lang/crates.io-index"
371
336
  checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
372
337
 
373
- [[package]]
374
- name = "displaydoc"
375
- version = "0.2.5"
376
- source = "registry+https://github.com/rust-lang/crates.io-index"
377
- checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
378
- dependencies = [
379
- "proc-macro2",
380
- "quote",
381
- "syn",
382
- ]
383
-
384
338
  [[package]]
385
339
  name = "either"
386
340
  version = "1.13.0"
@@ -423,104 +377,6 @@ dependencies = [
423
377
  "miniz_oxide",
424
378
  ]
425
379
 
426
- [[package]]
427
- name = "form_urlencoded"
428
- version = "1.2.1"
429
- source = "registry+https://github.com/rust-lang/crates.io-index"
430
- checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
431
- dependencies = [
432
- "percent-encoding",
433
- ]
434
-
435
- [[package]]
436
- name = "futures"
437
- version = "0.3.31"
438
- source = "registry+https://github.com/rust-lang/crates.io-index"
439
- checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
440
- dependencies = [
441
- "futures-channel",
442
- "futures-core",
443
- "futures-executor",
444
- "futures-io",
445
- "futures-sink",
446
- "futures-task",
447
- "futures-util",
448
- ]
449
-
450
- [[package]]
451
- name = "futures-channel"
452
- version = "0.3.31"
453
- source = "registry+https://github.com/rust-lang/crates.io-index"
454
- checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
455
- dependencies = [
456
- "futures-core",
457
- "futures-sink",
458
- ]
459
-
460
- [[package]]
461
- name = "futures-core"
462
- version = "0.3.31"
463
- source = "registry+https://github.com/rust-lang/crates.io-index"
464
- checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
465
-
466
- [[package]]
467
- name = "futures-executor"
468
- version = "0.3.31"
469
- source = "registry+https://github.com/rust-lang/crates.io-index"
470
- checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
471
- dependencies = [
472
- "futures-core",
473
- "futures-task",
474
- "futures-util",
475
- ]
476
-
477
- [[package]]
478
- name = "futures-io"
479
- version = "0.3.31"
480
- source = "registry+https://github.com/rust-lang/crates.io-index"
481
- checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
482
-
483
- [[package]]
484
- name = "futures-macro"
485
- version = "0.3.31"
486
- source = "registry+https://github.com/rust-lang/crates.io-index"
487
- checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
488
- dependencies = [
489
- "proc-macro2",
490
- "quote",
491
- "syn",
492
- ]
493
-
494
- [[package]]
495
- name = "futures-sink"
496
- version = "0.3.31"
497
- source = "registry+https://github.com/rust-lang/crates.io-index"
498
- checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
499
-
500
- [[package]]
501
- name = "futures-task"
502
- version = "0.3.31"
503
- source = "registry+https://github.com/rust-lang/crates.io-index"
504
- checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
505
-
506
- [[package]]
507
- name = "futures-util"
508
- version = "0.3.31"
509
- source = "registry+https://github.com/rust-lang/crates.io-index"
510
- checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
511
- dependencies = [
512
- "futures-channel",
513
- "futures-core",
514
- "futures-io",
515
- "futures-macro",
516
- "futures-sink",
517
- "futures-task",
518
- "memchr",
519
- "pin-project-lite",
520
- "pin-utils",
521
- "slab",
522
- ]
523
-
524
380
  [[package]]
525
381
  name = "getrandom"
526
382
  version = "0.2.15"
@@ -529,14 +385,20 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
529
385
  dependencies = [
530
386
  "cfg-if",
531
387
  "libc",
532
- "wasi",
388
+ "wasi 0.11.0+wasi-snapshot-preview1",
533
389
  ]
534
390
 
535
391
  [[package]]
536
- name = "gimli"
537
- version = "0.31.1"
392
+ name = "getrandom"
393
+ version = "0.3.1"
538
394
  source = "registry+https://github.com/rust-lang/crates.io-index"
539
- checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
395
+ checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8"
396
+ dependencies = [
397
+ "cfg-if",
398
+ "libc",
399
+ "wasi 0.13.3+wasi-0.2.2",
400
+ "windows-targets",
401
+ ]
540
402
 
541
403
  [[package]]
542
404
  name = "glob"
@@ -561,18 +423,6 @@ version = "0.15.2"
561
423
  source = "registry+https://github.com/rust-lang/crates.io-index"
562
424
  checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
563
425
 
564
- [[package]]
565
- name = "heck"
566
- version = "0.5.0"
567
- source = "registry+https://github.com/rust-lang/crates.io-index"
568
- checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
569
-
570
- [[package]]
571
- name = "humantime"
572
- version = "2.1.0"
573
- source = "registry+https://github.com/rust-lang/crates.io-index"
574
- checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
575
-
576
426
  [[package]]
577
427
  name = "iana-time-zone"
578
428
  version = "0.1.61"
@@ -596,145 +446,6 @@ dependencies = [
596
446
  "cc",
597
447
  ]
598
448
 
599
- [[package]]
600
- name = "icu_collections"
601
- version = "1.5.0"
602
- source = "registry+https://github.com/rust-lang/crates.io-index"
603
- checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
604
- dependencies = [
605
- "displaydoc",
606
- "yoke",
607
- "zerofrom",
608
- "zerovec",
609
- ]
610
-
611
- [[package]]
612
- name = "icu_locid"
613
- version = "1.5.0"
614
- source = "registry+https://github.com/rust-lang/crates.io-index"
615
- checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
616
- dependencies = [
617
- "displaydoc",
618
- "litemap",
619
- "tinystr",
620
- "writeable",
621
- "zerovec",
622
- ]
623
-
624
- [[package]]
625
- name = "icu_locid_transform"
626
- version = "1.5.0"
627
- source = "registry+https://github.com/rust-lang/crates.io-index"
628
- checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
629
- dependencies = [
630
- "displaydoc",
631
- "icu_locid",
632
- "icu_locid_transform_data",
633
- "icu_provider",
634
- "tinystr",
635
- "zerovec",
636
- ]
637
-
638
- [[package]]
639
- name = "icu_locid_transform_data"
640
- version = "1.5.0"
641
- source = "registry+https://github.com/rust-lang/crates.io-index"
642
- checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
643
-
644
- [[package]]
645
- name = "icu_normalizer"
646
- version = "1.5.0"
647
- source = "registry+https://github.com/rust-lang/crates.io-index"
648
- checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
649
- dependencies = [
650
- "displaydoc",
651
- "icu_collections",
652
- "icu_normalizer_data",
653
- "icu_properties",
654
- "icu_provider",
655
- "smallvec",
656
- "utf16_iter",
657
- "utf8_iter",
658
- "write16",
659
- "zerovec",
660
- ]
661
-
662
- [[package]]
663
- name = "icu_normalizer_data"
664
- version = "1.5.0"
665
- source = "registry+https://github.com/rust-lang/crates.io-index"
666
- checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
667
-
668
- [[package]]
669
- name = "icu_properties"
670
- version = "1.5.1"
671
- source = "registry+https://github.com/rust-lang/crates.io-index"
672
- checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
673
- dependencies = [
674
- "displaydoc",
675
- "icu_collections",
676
- "icu_locid_transform",
677
- "icu_properties_data",
678
- "icu_provider",
679
- "tinystr",
680
- "zerovec",
681
- ]
682
-
683
- [[package]]
684
- name = "icu_properties_data"
685
- version = "1.5.0"
686
- source = "registry+https://github.com/rust-lang/crates.io-index"
687
- checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
688
-
689
- [[package]]
690
- name = "icu_provider"
691
- version = "1.5.0"
692
- source = "registry+https://github.com/rust-lang/crates.io-index"
693
- checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
694
- dependencies = [
695
- "displaydoc",
696
- "icu_locid",
697
- "icu_provider_macros",
698
- "stable_deref_trait",
699
- "tinystr",
700
- "writeable",
701
- "yoke",
702
- "zerofrom",
703
- "zerovec",
704
- ]
705
-
706
- [[package]]
707
- name = "icu_provider_macros"
708
- version = "1.5.0"
709
- source = "registry+https://github.com/rust-lang/crates.io-index"
710
- checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
711
- dependencies = [
712
- "proc-macro2",
713
- "quote",
714
- "syn",
715
- ]
716
-
717
- [[package]]
718
- name = "idna"
719
- version = "1.0.3"
720
- source = "registry+https://github.com/rust-lang/crates.io-index"
721
- checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
722
- dependencies = [
723
- "idna_adapter",
724
- "smallvec",
725
- "utf8_iter",
726
- ]
727
-
728
- [[package]]
729
- name = "idna_adapter"
730
- version = "1.2.0"
731
- source = "registry+https://github.com/rust-lang/crates.io-index"
732
- checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
733
- dependencies = [
734
- "icu_normalizer",
735
- "icu_properties",
736
- ]
737
-
738
449
  [[package]]
739
450
  name = "integer-encoding"
740
451
  version = "3.0.4"
@@ -750,15 +461,6 @@ dependencies = [
750
461
  "either",
751
462
  ]
752
463
 
753
- [[package]]
754
- name = "itertools"
755
- version = "0.13.0"
756
- source = "registry+https://github.com/rust-lang/crates.io-index"
757
- checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
758
- dependencies = [
759
- "either",
760
- ]
761
-
762
464
  [[package]]
763
465
  name = "itertools"
764
466
  version = "0.14.0"
@@ -956,22 +658,6 @@ version = "0.4.15"
956
658
  source = "registry+https://github.com/rust-lang/crates.io-index"
957
659
  checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
958
660
 
959
- [[package]]
960
- name = "litemap"
961
- version = "0.7.4"
962
- source = "registry+https://github.com/rust-lang/crates.io-index"
963
- checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
964
-
965
- [[package]]
966
- name = "lock_api"
967
- version = "0.4.12"
968
- source = "registry+https://github.com/rust-lang/crates.io-index"
969
- checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
970
- dependencies = [
971
- "autocfg",
972
- "scopeguard",
973
- ]
974
-
975
661
  [[package]]
976
662
  name = "log"
977
663
  version = "0.4.22"
@@ -1124,36 +810,6 @@ dependencies = [
1124
810
  "libm",
1125
811
  ]
1126
812
 
1127
- [[package]]
1128
- name = "object"
1129
- version = "0.36.7"
1130
- source = "registry+https://github.com/rust-lang/crates.io-index"
1131
- checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
1132
- dependencies = [
1133
- "memchr",
1134
- ]
1135
-
1136
- [[package]]
1137
- name = "object_store"
1138
- version = "0.11.2"
1139
- source = "registry+https://github.com/rust-lang/crates.io-index"
1140
- checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf"
1141
- dependencies = [
1142
- "async-trait",
1143
- "bytes",
1144
- "chrono",
1145
- "futures",
1146
- "humantime",
1147
- "itertools 0.13.0",
1148
- "parking_lot",
1149
- "percent-encoding",
1150
- "snafu",
1151
- "tokio",
1152
- "tracing",
1153
- "url",
1154
- "walkdir",
1155
- ]
1156
-
1157
813
  [[package]]
1158
814
  name = "once_cell"
1159
815
  version = "1.20.2"
@@ -1169,29 +825,6 @@ dependencies = [
1169
825
  "num-traits",
1170
826
  ]
1171
827
 
1172
- [[package]]
1173
- name = "parking_lot"
1174
- version = "0.12.3"
1175
- source = "registry+https://github.com/rust-lang/crates.io-index"
1176
- checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
1177
- dependencies = [
1178
- "lock_api",
1179
- "parking_lot_core",
1180
- ]
1181
-
1182
- [[package]]
1183
- name = "parking_lot_core"
1184
- version = "0.9.10"
1185
- source = "registry+https://github.com/rust-lang/crates.io-index"
1186
- checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
1187
- dependencies = [
1188
- "cfg-if",
1189
- "libc",
1190
- "redox_syscall",
1191
- "smallvec",
1192
- "windows-targets",
1193
- ]
1194
-
1195
828
  [[package]]
1196
829
  name = "parquet"
1197
830
  version = "0.1.0"
@@ -1206,6 +839,7 @@ dependencies = [
1206
839
  "magnus",
1207
840
  "mimalloc",
1208
841
  "parquet 54.0.0",
842
+ "rand",
1209
843
  "rb-sys",
1210
844
  "tempfile",
1211
845
  "thiserror",
@@ -1230,19 +864,16 @@ dependencies = [
1230
864
  "bytes",
1231
865
  "chrono",
1232
866
  "flate2",
1233
- "futures",
1234
867
  "half",
1235
868
  "hashbrown",
1236
869
  "lz4_flex",
1237
870
  "num",
1238
871
  "num-bigint",
1239
- "object_store",
1240
872
  "paste",
1241
873
  "seq-macro",
1242
874
  "serde_json",
1243
875
  "snap",
1244
876
  "thrift",
1245
- "tokio",
1246
877
  "twox-hash",
1247
878
  "zstd",
1248
879
  "zstd-sys",
@@ -1254,24 +885,6 @@ version = "1.0.15"
1254
885
  source = "registry+https://github.com/rust-lang/crates.io-index"
1255
886
  checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
1256
887
 
1257
- [[package]]
1258
- name = "percent-encoding"
1259
- version = "2.3.1"
1260
- source = "registry+https://github.com/rust-lang/crates.io-index"
1261
- checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
1262
-
1263
- [[package]]
1264
- name = "pin-project-lite"
1265
- version = "0.2.15"
1266
- source = "registry+https://github.com/rust-lang/crates.io-index"
1267
- checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff"
1268
-
1269
- [[package]]
1270
- name = "pin-utils"
1271
- version = "0.1.0"
1272
- source = "registry+https://github.com/rust-lang/crates.io-index"
1273
- checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
1274
-
1275
888
  [[package]]
1276
889
  name = "pkg-config"
1277
890
  version = "0.3.31"
@@ -1293,6 +906,15 @@ dependencies = [
1293
906
  "portable-atomic",
1294
907
  ]
1295
908
 
909
+ [[package]]
910
+ name = "ppv-lite86"
911
+ version = "0.2.20"
912
+ source = "registry+https://github.com/rust-lang/crates.io-index"
913
+ checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
914
+ dependencies = [
915
+ "zerocopy 0.7.35",
916
+ ]
917
+
1296
918
  [[package]]
1297
919
  name = "proc-macro2"
1298
920
  version = "1.0.92"
@@ -1311,6 +933,37 @@ dependencies = [
1311
933
  "proc-macro2",
1312
934
  ]
1313
935
 
936
+ [[package]]
937
+ name = "rand"
938
+ version = "0.9.0"
939
+ source = "registry+https://github.com/rust-lang/crates.io-index"
940
+ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94"
941
+ dependencies = [
942
+ "rand_chacha",
943
+ "rand_core",
944
+ "zerocopy 0.8.14",
945
+ ]
946
+
947
+ [[package]]
948
+ name = "rand_chacha"
949
+ version = "0.9.0"
950
+ source = "registry+https://github.com/rust-lang/crates.io-index"
951
+ checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
952
+ dependencies = [
953
+ "ppv-lite86",
954
+ "rand_core",
955
+ ]
956
+
957
+ [[package]]
958
+ name = "rand_core"
959
+ version = "0.9.0"
960
+ source = "registry+https://github.com/rust-lang/crates.io-index"
961
+ checksum = "b08f3c9802962f7e1b25113931d94f43ed9725bebc59db9d0c3e9a23b67e15ff"
962
+ dependencies = [
963
+ "getrandom 0.3.1",
964
+ "zerocopy 0.8.14",
965
+ ]
966
+
1314
967
  [[package]]
1315
968
  name = "rb-sys"
1316
969
  version = "0.9.104"
@@ -1341,15 +994,6 @@ version = "0.1.2"
1341
994
  source = "registry+https://github.com/rust-lang/crates.io-index"
1342
995
  checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
1343
996
 
1344
- [[package]]
1345
- name = "redox_syscall"
1346
- version = "0.5.8"
1347
- source = "registry+https://github.com/rust-lang/crates.io-index"
1348
- checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
1349
- dependencies = [
1350
- "bitflags 2.6.0",
1351
- ]
1352
-
1353
997
  [[package]]
1354
998
  name = "regex"
1355
999
  version = "1.11.1"
@@ -1379,12 +1023,6 @@ version = "0.8.5"
1379
1023
  source = "registry+https://github.com/rust-lang/crates.io-index"
1380
1024
  checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
1381
1025
 
1382
- [[package]]
1383
- name = "rustc-demangle"
1384
- version = "0.1.24"
1385
- source = "registry+https://github.com/rust-lang/crates.io-index"
1386
- checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
1387
-
1388
1026
  [[package]]
1389
1027
  name = "rustc-hash"
1390
1028
  version = "1.1.0"
@@ -1419,21 +1057,6 @@ version = "1.0.18"
1419
1057
  source = "registry+https://github.com/rust-lang/crates.io-index"
1420
1058
  checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
1421
1059
 
1422
- [[package]]
1423
- name = "same-file"
1424
- version = "1.0.6"
1425
- source = "registry+https://github.com/rust-lang/crates.io-index"
1426
- checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
1427
- dependencies = [
1428
- "winapi-util",
1429
- ]
1430
-
1431
- [[package]]
1432
- name = "scopeguard"
1433
- version = "1.2.0"
1434
- source = "registry+https://github.com/rust-lang/crates.io-index"
1435
- checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
1436
-
1437
1060
  [[package]]
1438
1061
  name = "semver"
1439
1062
  version = "1.0.24"
@@ -1490,54 +1113,12 @@ version = "1.3.0"
1490
1113
  source = "registry+https://github.com/rust-lang/crates.io-index"
1491
1114
  checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
1492
1115
 
1493
- [[package]]
1494
- name = "slab"
1495
- version = "0.4.9"
1496
- source = "registry+https://github.com/rust-lang/crates.io-index"
1497
- checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
1498
- dependencies = [
1499
- "autocfg",
1500
- ]
1501
-
1502
- [[package]]
1503
- name = "smallvec"
1504
- version = "1.13.2"
1505
- source = "registry+https://github.com/rust-lang/crates.io-index"
1506
- checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
1507
-
1508
- [[package]]
1509
- name = "snafu"
1510
- version = "0.8.5"
1511
- source = "registry+https://github.com/rust-lang/crates.io-index"
1512
- checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019"
1513
- dependencies = [
1514
- "snafu-derive",
1515
- ]
1516
-
1517
- [[package]]
1518
- name = "snafu-derive"
1519
- version = "0.8.5"
1520
- source = "registry+https://github.com/rust-lang/crates.io-index"
1521
- checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917"
1522
- dependencies = [
1523
- "heck",
1524
- "proc-macro2",
1525
- "quote",
1526
- "syn",
1527
- ]
1528
-
1529
1116
  [[package]]
1530
1117
  name = "snap"
1531
1118
  version = "1.1.1"
1532
1119
  source = "registry+https://github.com/rust-lang/crates.io-index"
1533
1120
  checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
1534
1121
 
1535
- [[package]]
1536
- name = "stable_deref_trait"
1537
- version = "1.2.0"
1538
- source = "registry+https://github.com/rust-lang/crates.io-index"
1539
- checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
1540
-
1541
1122
  [[package]]
1542
1123
  name = "static_assertions"
1543
1124
  version = "1.1.0"
@@ -1555,17 +1136,6 @@ dependencies = [
1555
1136
  "unicode-ident",
1556
1137
  ]
1557
1138
 
1558
- [[package]]
1559
- name = "synstructure"
1560
- version = "0.13.1"
1561
- source = "registry+https://github.com/rust-lang/crates.io-index"
1562
- checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
1563
- dependencies = [
1564
- "proc-macro2",
1565
- "quote",
1566
- "syn",
1567
- ]
1568
-
1569
1139
  [[package]]
1570
1140
  name = "tempfile"
1571
1141
  version = "3.15.0"
@@ -1574,7 +1144,7 @@ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
1574
1144
  dependencies = [
1575
1145
  "cfg-if",
1576
1146
  "fastrand",
1577
- "getrandom",
1147
+ "getrandom 0.2.15",
1578
1148
  "once_cell",
1579
1149
  "rustix",
1580
1150
  "windows-sys",
@@ -1620,70 +1190,6 @@ dependencies = [
1620
1190
  "crunchy",
1621
1191
  ]
1622
1192
 
1623
- [[package]]
1624
- name = "tinystr"
1625
- version = "0.7.6"
1626
- source = "registry+https://github.com/rust-lang/crates.io-index"
1627
- checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
1628
- dependencies = [
1629
- "displaydoc",
1630
- "zerovec",
1631
- ]
1632
-
1633
- [[package]]
1634
- name = "tokio"
1635
- version = "1.42.0"
1636
- source = "registry+https://github.com/rust-lang/crates.io-index"
1637
- checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551"
1638
- dependencies = [
1639
- "backtrace",
1640
- "bytes",
1641
- "pin-project-lite",
1642
- "tokio-macros",
1643
- ]
1644
-
1645
- [[package]]
1646
- name = "tokio-macros"
1647
- version = "2.4.0"
1648
- source = "registry+https://github.com/rust-lang/crates.io-index"
1649
- checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
1650
- dependencies = [
1651
- "proc-macro2",
1652
- "quote",
1653
- "syn",
1654
- ]
1655
-
1656
- [[package]]
1657
- name = "tracing"
1658
- version = "0.1.41"
1659
- source = "registry+https://github.com/rust-lang/crates.io-index"
1660
- checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
1661
- dependencies = [
1662
- "pin-project-lite",
1663
- "tracing-attributes",
1664
- "tracing-core",
1665
- ]
1666
-
1667
- [[package]]
1668
- name = "tracing-attributes"
1669
- version = "0.1.28"
1670
- source = "registry+https://github.com/rust-lang/crates.io-index"
1671
- checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
1672
- dependencies = [
1673
- "proc-macro2",
1674
- "quote",
1675
- "syn",
1676
- ]
1677
-
1678
- [[package]]
1679
- name = "tracing-core"
1680
- version = "0.1.33"
1681
- source = "registry+https://github.com/rust-lang/crates.io-index"
1682
- checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
1683
- dependencies = [
1684
- "once_cell",
1685
- ]
1686
-
1687
1193
  [[package]]
1688
1194
  name = "twox-hash"
1689
1195
  version = "1.6.3"
@@ -1700,29 +1206,6 @@ version = "1.0.14"
1700
1206
  source = "registry+https://github.com/rust-lang/crates.io-index"
1701
1207
  checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
1702
1208
 
1703
- [[package]]
1704
- name = "url"
1705
- version = "2.5.4"
1706
- source = "registry+https://github.com/rust-lang/crates.io-index"
1707
- checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
1708
- dependencies = [
1709
- "form_urlencoded",
1710
- "idna",
1711
- "percent-encoding",
1712
- ]
1713
-
1714
- [[package]]
1715
- name = "utf16_iter"
1716
- version = "1.0.5"
1717
- source = "registry+https://github.com/rust-lang/crates.io-index"
1718
- checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
1719
-
1720
- [[package]]
1721
- name = "utf8_iter"
1722
- version = "1.0.4"
1723
- source = "registry+https://github.com/rust-lang/crates.io-index"
1724
- checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
1725
-
1726
1209
  [[package]]
1727
1210
  name = "version_check"
1728
1211
  version = "0.9.5"
@@ -1730,20 +1213,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1730
1213
  checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
1731
1214
 
1732
1215
  [[package]]
1733
- name = "walkdir"
1734
- version = "2.5.0"
1216
+ name = "wasi"
1217
+ version = "0.11.0+wasi-snapshot-preview1"
1735
1218
  source = "registry+https://github.com/rust-lang/crates.io-index"
1736
- checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
1737
- dependencies = [
1738
- "same-file",
1739
- "winapi-util",
1740
- ]
1219
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
1741
1220
 
1742
1221
  [[package]]
1743
1222
  name = "wasi"
1744
- version = "0.11.0+wasi-snapshot-preview1"
1223
+ version = "0.13.3+wasi-0.2.2"
1745
1224
  source = "registry+https://github.com/rust-lang/crates.io-index"
1746
- checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
1225
+ checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2"
1226
+ dependencies = [
1227
+ "wit-bindgen-rt",
1228
+ ]
1747
1229
 
1748
1230
  [[package]]
1749
1231
  name = "wasm-bindgen"
@@ -1799,15 +1281,6 @@ version = "0.2.99"
1799
1281
  source = "registry+https://github.com/rust-lang/crates.io-index"
1800
1282
  checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
1801
1283
 
1802
- [[package]]
1803
- name = "winapi-util"
1804
- version = "0.1.9"
1805
- source = "registry+https://github.com/rust-lang/crates.io-index"
1806
- checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
1807
- dependencies = [
1808
- "windows-sys",
1809
- ]
1810
-
1811
1284
  [[package]]
1812
1285
  name = "windows-core"
1813
1286
  version = "0.52.0"
@@ -1891,48 +1364,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1891
1364
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
1892
1365
 
1893
1366
  [[package]]
1894
- name = "write16"
1895
- version = "1.0.0"
1367
+ name = "wit-bindgen-rt"
1368
+ version = "0.33.0"
1896
1369
  source = "registry+https://github.com/rust-lang/crates.io-index"
1897
- checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
1898
-
1899
- [[package]]
1900
- name = "writeable"
1901
- version = "0.5.5"
1902
- source = "registry+https://github.com/rust-lang/crates.io-index"
1903
- checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1904
-
1905
- [[package]]
1906
- name = "yoke"
1907
- version = "0.7.5"
1908
- source = "registry+https://github.com/rust-lang/crates.io-index"
1909
- checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
1370
+ checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
1910
1371
  dependencies = [
1911
- "serde",
1912
- "stable_deref_trait",
1913
- "yoke-derive",
1914
- "zerofrom",
1372
+ "bitflags 2.6.0",
1915
1373
  ]
1916
1374
 
1917
1375
  [[package]]
1918
- name = "yoke-derive"
1919
- version = "0.7.5"
1376
+ name = "zerocopy"
1377
+ version = "0.7.35"
1920
1378
  source = "registry+https://github.com/rust-lang/crates.io-index"
1921
- checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
1379
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
1922
1380
  dependencies = [
1923
- "proc-macro2",
1924
- "quote",
1925
- "syn",
1926
- "synstructure",
1381
+ "byteorder",
1382
+ "zerocopy-derive 0.7.35",
1927
1383
  ]
1928
1384
 
1929
1385
  [[package]]
1930
1386
  name = "zerocopy"
1931
- version = "0.7.35"
1387
+ version = "0.8.14"
1932
1388
  source = "registry+https://github.com/rust-lang/crates.io-index"
1933
- checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
1389
+ checksum = "a367f292d93d4eab890745e75a778da40909cab4d6ff8173693812f79c4a2468"
1934
1390
  dependencies = [
1935
- "zerocopy-derive",
1391
+ "zerocopy-derive 0.8.14",
1936
1392
  ]
1937
1393
 
1938
1394
  [[package]]
@@ -1947,42 +1403,10 @@ dependencies = [
1947
1403
  ]
1948
1404
 
1949
1405
  [[package]]
1950
- name = "zerofrom"
1951
- version = "0.1.5"
1952
- source = "registry+https://github.com/rust-lang/crates.io-index"
1953
- checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
1954
- dependencies = [
1955
- "zerofrom-derive",
1956
- ]
1957
-
1958
- [[package]]
1959
- name = "zerofrom-derive"
1960
- version = "0.1.5"
1961
- source = "registry+https://github.com/rust-lang/crates.io-index"
1962
- checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
1963
- dependencies = [
1964
- "proc-macro2",
1965
- "quote",
1966
- "syn",
1967
- "synstructure",
1968
- ]
1969
-
1970
- [[package]]
1971
- name = "zerovec"
1972
- version = "0.10.4"
1973
- source = "registry+https://github.com/rust-lang/crates.io-index"
1974
- checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
1975
- dependencies = [
1976
- "yoke",
1977
- "zerofrom",
1978
- "zerovec-derive",
1979
- ]
1980
-
1981
- [[package]]
1982
- name = "zerovec-derive"
1983
- version = "0.10.3"
1406
+ name = "zerocopy-derive"
1407
+ version = "0.8.14"
1984
1408
  source = "registry+https://github.com/rust-lang/crates.io-index"
1985
- checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
1409
+ checksum = "d3931cb58c62c13adec22e38686b559c86a30565e16ad6e8510a337cedc611e1"
1986
1410
  dependencies = [
1987
1411
  "proc-macro2",
1988
1412
  "quote",
data/README.md CHANGED
@@ -124,6 +124,13 @@ Parquet.write_rows(rows,
124
124
  write_to: "data.parquet",
125
125
  flush_threshold: 32 * 1024 * 1024 # 32MB
126
126
  )
127
+
128
+ # Optionally specify sample size for row size estimation (default is 100)
129
+ Parquet.write_rows(rows,
130
+ schema: schema,
131
+ write_to: "data.parquet",
132
+ sample_size: 200 # Sample 200 rows for size estimation
133
+ )
127
134
  ```
128
135
 
129
136
  ### Writing Column-wise Data
@@ -14,7 +14,8 @@ bytes = "^1.9"
14
14
  itertools = "^0.14"
15
15
  jiff = "0.1.19"
16
16
  magnus = { version = "0.7", features = ["rb-sys"] }
17
- parquet = { version = "^54.0", features = ["json", "object_store"] }
17
+ parquet = { version = "^54.0", features = ["json"] }
18
+ rand = "0.9"
18
19
  rb-sys = "^0.9"
19
20
  thiserror = "2.0"
20
21
  tempfile = "^3.15"
@@ -26,6 +26,7 @@ pub struct ParquetWriteArgs<'a> {
26
26
  pub batch_size: Option<usize>,
27
27
  pub flush_threshold: Option<usize>,
28
28
  pub compression: Option<String>,
29
+ pub sample_size: Option<usize>,
29
30
  }
30
31
 
31
32
  pub trait SendableWrite: Send + Write {}
@@ -1,6 +1,7 @@
1
1
  use std::{
2
2
  fs::File,
3
3
  io::{self, BufReader, BufWriter},
4
+ mem,
4
5
  sync::Arc,
5
6
  };
6
7
 
@@ -16,6 +17,7 @@ use parquet::{
16
17
  basic::{Compression, GzipLevel, ZstdLevel},
17
18
  file::properties::WriterProperties,
18
19
  };
20
+ use rand::Rng;
19
21
  use tempfile::NamedTempFile;
20
22
 
21
23
  use crate::{
@@ -24,7 +26,9 @@ use crate::{
24
26
  IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
25
27
  };
26
28
 
27
- const DEFAULT_BATCH_SIZE: usize = 1000;
29
+ const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
30
+ const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
31
+ const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
28
32
 
29
33
  // Maximum memory usage per batch (64MB by default)
30
34
  const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
@@ -42,12 +46,18 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
42
46
  Option<Option<usize>>,
43
47
  Option<Option<usize>>,
44
48
  Option<Option<String>>,
49
+ Option<Option<usize>>,
45
50
  ),
46
51
  (),
47
52
  >(
48
53
  parsed_args.keywords,
49
54
  &["schema", "write_to"],
50
- &["batch_size", "flush_threshold", "compression"],
55
+ &[
56
+ "batch_size",
57
+ "flush_threshold",
58
+ "compression",
59
+ "sample_size",
60
+ ],
51
61
  )?;
52
62
 
53
63
  let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
@@ -123,9 +133,57 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
123
133
  batch_size: kwargs.optional.0.flatten(),
124
134
  flush_threshold: kwargs.optional.1.flatten(),
125
135
  compression: kwargs.optional.2.flatten(),
136
+ sample_size: kwargs.optional.3.flatten(),
126
137
  })
127
138
  }
128
139
 
140
+ /// Estimate the size of a row
141
+ fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
142
+ let mut row_size = 0;
143
+ for (field, value) in schema.iter().zip(row.into_iter()) {
144
+ // Estimate size based on type and value
145
+ row_size += match &field.type_ {
146
+ // Use reference to avoid moving
147
+ ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
148
+ ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
149
+ ParquetSchemaType::Int32
150
+ | ParquetSchemaType::UInt32
151
+ | ParquetSchemaType::Float
152
+ | ParquetSchemaType::Date32 => 4,
153
+ ParquetSchemaType::Int64
154
+ | ParquetSchemaType::UInt64
155
+ | ParquetSchemaType::Double
156
+ | ParquetSchemaType::TimestampMillis
157
+ | ParquetSchemaType::TimestampMicros => 8,
158
+ ParquetSchemaType::String => {
159
+ if let Ok(s) = String::try_convert(value) {
160
+ s.len() + mem::size_of::<usize>() // account for length prefix
161
+ } else {
162
+ 16 // default estimate for string
163
+ }
164
+ }
165
+ ParquetSchemaType::Binary => {
166
+ if let Ok(bytes) = Vec::<u8>::try_convert(value) {
167
+ bytes.len() + mem::size_of::<usize>() // account for length prefix
168
+ } else {
169
+ 16 // default estimate for binary
170
+ }
171
+ }
172
+ ParquetSchemaType::Boolean => 1,
173
+ ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
174
+ 32 // rough estimate for complex types
175
+ }
176
+ };
177
+ }
178
+ Ok(row_size)
179
+ }
180
+
181
+ /// Calculate optimal batch size based on memory threshold and estimated row size
182
+ fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
183
+ let batch_size = memory_threshold / row_size;
184
+ batch_size.max(MIN_BATCH_SIZE)
185
+ }
186
+
129
187
  #[inline]
130
188
  pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
131
189
  let ruby = unsafe { Ruby::get_unchecked() };
@@ -134,13 +192,12 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
134
192
  read_from,
135
193
  write_to,
136
194
  schema,
137
- batch_size,
195
+ batch_size: user_batch_size,
138
196
  compression,
139
197
  flush_threshold,
198
+ sample_size: user_sample_size,
140
199
  } = parse_parquet_write_args(args)?;
141
200
 
142
- let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
143
-
144
201
  let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
145
202
 
146
203
  // Convert schema to Arrow schema
@@ -185,11 +242,20 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
185
242
  if read_from.is_kind_of(ruby.class_enumerator()) {
186
243
  // Create collectors for each column
187
244
  let mut column_collectors: Vec<ColumnCollector> = schema
188
- .into_iter()
189
- .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
245
+ .iter()
246
+ .map(|field| {
247
+ // Clone the type to avoid moving from a reference
248
+ let type_clone = field.type_.clone();
249
+ ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
250
+ })
190
251
  .collect();
191
252
 
192
253
  let mut rows_in_batch = 0;
254
+ let mut total_rows = 0;
255
+ let mut rng = rand::rng();
256
+ let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
257
+ let mut size_samples = Vec::with_capacity(sample_size);
258
+ let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
193
259
 
194
260
  loop {
195
261
  match read_from.funcall::<_, _, Value>("next", ()) {
@@ -211,15 +277,30 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
211
277
  ));
212
278
  }
213
279
 
214
- // Process each value in the row immediately
280
+ // Sample row sizes using reservoir sampling
281
+ if size_samples.len() < sample_size {
282
+ size_samples.push(estimate_single_row_size(&row_array, &schema)?);
283
+ } else if rng.random_range(0..=total_rows) < sample_size {
284
+ let idx = rng.random_range(0..sample_size);
285
+ size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
286
+ }
287
+
288
+ // Process each value in the row
215
289
  for (collector, value) in column_collectors.iter_mut().zip(row_array) {
216
290
  collector.push_value(value)?;
217
291
  }
218
292
 
219
293
  rows_in_batch += 1;
294
+ total_rows += 1;
295
+
296
+ // Recalculate batch size if we have enough samples and no user-specified size
297
+ if size_samples.len() >= sample_size && user_batch_size.is_none() {
298
+ let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
299
+ current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
300
+ }
220
301
 
221
302
  // When we reach batch size, write the batch
222
- if rows_in_batch >= batch_size {
303
+ if rows_in_batch >= current_batch_size {
223
304
  write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
224
305
  rows_in_batch = 0;
225
306
  }
@@ -263,6 +344,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
263
344
  batch_size: _,
264
345
  compression,
265
346
  flush_threshold,
347
+ sample_size: _,
266
348
  } = parse_parquet_write_args(args)?;
267
349
 
268
350
  let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
@@ -483,7 +565,7 @@ fn write_batch(
483
565
 
484
566
  match writer {
485
567
  WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
486
- if w.in_progress_size() >= flush_threshold {
568
+ if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
487
569
  w.flush().map_err(|e| ParquetErrorWrapper(e))?;
488
570
  }
489
571
  }
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.9"
2
+ VERSION = "0.2.10"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -68,6 +68,7 @@ module Parquet
68
68
  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
69
69
  # - `compression`: Optional compression type to use (defaults to "zstd")
70
70
  # Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
71
+ # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
71
72
  sig do
72
73
  params(
73
74
  read_from: T::Enumerator[T::Array[T.untyped]],
@@ -75,10 +76,11 @@ module Parquet
75
76
  write_to: T.any(String, IO),
76
77
  batch_size: T.nilable(Integer),
77
78
  flush_threshold: T.nilable(Integer),
78
- compression: T.nilable(String)
79
+ compression: T.nilable(String),
80
+ sample_size: T.nilable(Integer)
79
81
  ).void
80
82
  end
81
- def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
83
+ def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil, sample_size: nil)
82
84
  end
83
85
 
84
86
  # Options:
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.9
4
+ version: 0.2.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-29 00:00:00.000000000 Z
11
+ date: 2025-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys