osv 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 872cf06d1389f45f77b4eefc178cc8462ab165b833ab2c5bf4dc7f92e1c8308e
-  data.tar.gz: 84e6c5d0e03389966b8882a5a73f1698ddee3ed0edae24f2fd5b7f257935a98e
+  metadata.gz: 4469c67b2a39d9ffa23923e36cd894eac415ca004a432e700102a334af11efd8
+  data.tar.gz: 8dee3117fe6511b9c5b6005ae37d991891e0f314508986743b659080c7885855
 SHA512:
-  metadata.gz: 445581447e8f5ec336da7843af715a5f5fbc298232a24f303a22eebb844f83f65ecc2e85d877a448119adae9e6a5529e377d87399a36e6f070562fa4ce0a11b7
-  data.tar.gz: '08f417b19b0549aa4a3db1538e4be413c5ec8faa3bd18e4c101a6fc3ea3e9496d04c30e39ea8eec9cc0cc3a38f8f83f7c2274e09c75259a26f3609620cf07a80'
+  metadata.gz: d8c94dc1c576cca0043c7501752bdd6dee0c8bf0523d9c99a0e8ab4d614a0eb4e6f087fa62be97bb5816f9998f2c414758ffcab260e90889afada8379fb03aec
+  data.tar.gz: c51ece65a713af0b351a183415816302fcdc35ad598d0e5ee9e5b693c1ef66826c5dfc3dab90f04499c90e994e590e6dd7121999b5dfe54ce20997e41df0ac02
data/Cargo.lock CHANGED
@@ -45,7 +45,7 @@ dependencies = [
  "bitflags",
  "cexpr",
  "clang-sys",
- "itertools",
+ "itertools 0.12.1",
  "lazy_static",
  "lazycell",
  "proc-macro2",
@@ -175,6 +175,15 @@ dependencies = [
  "either",
 ]
 
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.14"
@@ -347,6 +356,7 @@ dependencies = [
  "ahash",
  "csv",
  "flate2",
+ "itertools 0.14.0",
  "jemallocator",
  "kanal",
  "magnus 0.7.1",
data/README.md CHANGED
@@ -121,7 +121,7 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
 ### 1,000,000 records
 
 ```
-🏃 Running benchmarks...
+🏃 Running benchmarks...
 Benchmarking with 3000001 lines of data
 
 ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24]
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
 FastCSV - Gzipped 1.000 i/100ms
 CSV - Gzipped 1.000 i/100ms
 Calculating -------------------------------------
-CSV - StringIO 0.079 (± 0.0%) i/s (12.69 s/i) - 3.000 in 38.139709s
-FastCSV - StringIO 0.370 (± 0.0%) i/s (2.71 s/i) - 12.000 in 32.474164s
-OSV - StringIO 0.635 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.772490s
-CSV - Hash output 0.058 (± 0.0%) i/s (17.11 s/i) - 2.000 in 34.212335s
-OSV - Hash output 0.249 (± 0.0%) i/s (4.01 s/i) - 8.000 in 32.124319s
-CSV - Array output 0.066 (± 0.0%) i/s (15.11 s/i) - 2.000 in 30.212137s
-OSV - Array output 0.665 (± 0.0%) i/s (1.50 s/i) - 20.000 in 30.813986s
+CSV - StringIO 0.080 (± 0.0%) i/s (12.43 s/i) - 3.000 in 37.301114s
+FastCSV - StringIO 0.368 (± 0.0%) i/s (2.72 s/i) - 12.000 in 32.619020s
+OSV - StringIO 0.699 (± 0.0%) i/s (1.43 s/i) - 21.000 in 30.091225s
+CSV - Hash output 0.059 (± 0.0%) i/s (16.95 s/i) - 2.000 in 33.908533s
+OSV - Hash output 0.329 (± 0.0%) i/s (3.04 s/i) - 10.000 in 30.551275s
+CSV - Array output 0.066 (± 0.0%) i/s (15.18 s/i) - 2.000 in 30.357327s
+OSV - Array output 0.632 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.150113s
 FastCSV - Array output
-0.351 (± 0.0%) i/s (2.85 s/i) - 11.000 in 31.418786s
+0.350 (± 0.0%) i/s (2.86 s/i) - 11.000 in 31.477268s
 OSV - Direct Open Array output
-0.713 (± 0.0%) i/s (1.40 s/i) - 22.000 in 30.938525s
-OSV - Gzipped 0.506 (± 0.0%) i/s (1.98 s/i) - 16.000 in 31.709708s
-OSV - Gzipped Direct 0.685 (± 0.0%) i/s (1.46 s/i) - 21.000 in 31.145435s
-FastCSV - Gzipped 0.324 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.983582s
-CSV - Gzipped 0.057 (± 0.0%) i/s (17.69 s/i) - 2.000 in 35.379009s
+0.641 (± 0.0%) i/s (1.56 s/i) - 20.000 in 31.275201s
+OSV - Gzipped 0.530 (± 0.0%) i/s (1.89 s/i) - 16.000 in 30.183753s
+OSV - Gzipped Direct 0.727 (± 0.0%) i/s (1.37 s/i) - 22.000 in 30.283991s
+FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.949600s
+CSV - Gzipped 0.056 (± 0.0%) i/s (17.72 s/i) - 2.000 in 35.440473s
 
 Comparison:
-OSV - Direct Open Array output: 0.7 i/s
-OSV - Gzipped Direct: 0.7 i/s - 1.04x slower
-OSV - Array output: 0.7 i/s - 1.07x slower
-OSV - StringIO: 0.6 i/s - 1.12x slower
-OSV - Gzipped: 0.5 i/s - 1.41x slower
-FastCSV - StringIO: 0.4 i/s - 1.93x slower
-FastCSV - Array output: 0.4 i/s - 2.03x slower
-FastCSV - Gzipped: 0.3 i/s - 2.20x slower
-OSV - Hash output: 0.2 i/s - 2.86x slower
-CSV - StringIO: 0.1 i/s - 9.05x slower
-CSV - Array output: 0.1 i/s - 10.77x slower
-CSV - Hash output: 0.1 i/s - 12.20x slower
-CSV - Gzipped: 0.1 i/s - 12.61x slower
+OSV - Gzipped Direct: 0.7 i/s
+OSV - StringIO: 0.7 i/s - 1.04x slower
+OSV - Direct Open Array output: 0.6 i/s - 1.14x slower
+OSV - Array output: 0.6 i/s - 1.15x slower
+OSV - Gzipped: 0.5 i/s - 1.37x slower
+FastCSV - StringIO: 0.4 i/s - 1.98x slower
+FastCSV - Array output: 0.3 i/s - 2.08x slower
+OSV - Hash output: 0.3 i/s - 2.21x slower
+FastCSV - Gzipped: 0.3 i/s - 2.25x slower
+CSV - StringIO: 0.1 i/s - 9.04x slower
+CSV - Array output: 0.1 i/s - 11.04x slower
+CSV - Hash output: 0.1 i/s - 12.33x slower
+CSV - Gzipped: 0.1 i/s - 12.89x slower
 ```
data/ext/osv/Cargo.toml CHANGED
@@ -16,6 +16,7 @@ rb-sys = "^0.9"
 serde = { version = "1.0", features = ["derive"] }
 serde_magnus = "0.8.1"
 thiserror = "2.0"
+itertools = "^0.14"
 
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
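
The only manifest change here is the new `itertools = "^0.14"` dependency, which is what pulls the second `itertools` entry into Cargo.lock above. How osv itself uses the crate is not visible in this diff; the sketch below is only an illustration of the kind of iterator adapters itertools 0.14 provides, and its data and function names are made up for the example.

```
// Illustration only: not osv code, just what the new itertools dependency offers.
use itertools::Itertools;

fn main() {
    let headers = ["id", "name", "email"];
    // join: collapse an iterator into a delimited string.
    let header_line = headers.iter().join(",");
    assert_eq!(header_line, "id,name,email");

    // chunks: walk items in fixed-size groups without collecting everything first.
    for chunk in &(1..=6).chunks(2) {
        println!("{:?}", chunk.collect::<Vec<_>>());
    }
}
```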
@@ -6,8 +6,10 @@ use super::{
     ForgottenFileHandle,
 };
 use flate2::read::GzDecoder;
-use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
 use std::{
+    borrow::Cow,
+    fmt::Debug,
     fs::File,
     io::{self, BufReader, Read},
     marker::PhantomData,
@@ -17,18 +19,21 @@ use std::{
 
 use thiserror::Error;
 
-pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
-
+/// Errors that can occur when building a RecordReader
 #[derive(Error, Debug)]
 pub enum ReaderError {
     #[error("Failed to get file descriptor: {0}")]
     FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
+    #[error("Invalid file descriptor: {0}")]
+    InvalidFileDescriptor(i32),
     #[error("Failed to open file: {0}")]
     FileOpen(#[from] io::Error),
     #[error("Failed to intern headers: {0}")]
     HeaderIntern(#[from] CacheError),
+    #[error("Invalid flexible default value: {0}")]
+    InvalidFlexibleDefault(String),
+    #[error("Invalid null string value: {0}")]
+    InvalidNullString(String),
     #[error("Ruby error: {0}")]
     Ruby(String),
 }
@@ -48,63 +53,27 @@ impl From<ReaderError> for MagnusError {
     }
 }
 
-pub struct RecordReaderBuilder<'a, T: RecordParser<'a> + Send> {
-    ruby: &'a Ruby,
+/// Builder for configuring and creating a RecordReader instance.
+///
+/// This struct provides a fluent interface for setting up CSV parsing options
+/// and creating a RecordReader with the specified configuration.
+pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
+    ruby: Ruby,
     to_read: Value,
     has_headers: bool,
     delimiter: u8,
     quote_char: u8,
     null_string: Option<String>,
-    buffer: usize,
     flexible: bool,
-    flexible_default: Option<&'a str>,
+    flexible_default: Option<String>,
     trim: csv::Trim,
     _phantom: PhantomData<T>,
+    _phantom_a: PhantomData<&'a ()>,
 }
 
-impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T> {
-    fn build_multi_threaded(
-        self,
-        readable: Box<dyn Read + Send + 'static>,
-    ) -> Result<RecordReader<'static, T>, ReaderError> {
-        let flexible = self.flexible || self.flexible_default.is_some();
-        let mut reader = csv::ReaderBuilder::new()
-            .has_headers(self.has_headers)
-            .delimiter(self.delimiter)
-            .quote(self.quote_char)
-            .flexible(flexible)
-            .trim(self.trim)
-            .from_reader(readable);
-
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
-        let static_headers = StringCache::intern_many(&headers)?;
-
-        Ok(RecordReader::new_multi_threaded(
-            reader,
-            static_headers,
-            self.buffer,
-            self.null_string,
-            self.flexible_default,
-        ))
-    }
-
-    pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
-        if self.to_read.is_kind_of(self.ruby.class_io()) {
-            let readable = self.handle_file_descriptor()?;
-            self.build_multi_threaded(readable)
-        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
-            let readable = self.handle_file_path()?;
-            self.build_multi_threaded(readable)
-        } else {
-            let readable = build_ruby_reader(self.ruby, self.to_read)?;
-            let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
-            self.build_single_threaded(buffered_reader)
-        }
-    }
-}
-
-impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
-    pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
+impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
+    /// Creates a new builder instance with default settings.
+    pub fn new(ruby: Ruby, to_read: Value) -> Self {
         Self {
             ruby,
             to_read,
@@ -112,92 +81,107 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
             delimiter: b',',
             quote_char: b'"',
             null_string: None,
-            buffer: BUFFER_CHANNEL_SIZE,
             flexible: false,
             flexible_default: None,
             trim: csv::Trim::None,
             _phantom: PhantomData,
+            _phantom_a: PhantomData,
         }
     }
 
+    /// Sets whether the CSV file has headers.
+    #[must_use]
     pub fn has_headers(mut self, has_headers: bool) -> Self {
         self.has_headers = has_headers;
         self
     }
 
+    /// Sets the delimiter character for the CSV.
+    #[must_use]
     pub fn delimiter(mut self, delimiter: u8) -> Self {
         self.delimiter = delimiter;
         self
     }
 
+    /// Sets the quote character for the CSV.
+    #[must_use]
     pub fn quote_char(mut self, quote_char: u8) -> Self {
         self.quote_char = quote_char;
         self
     }
 
+    /// Sets the string that should be interpreted as null.
+    #[must_use]
     pub fn null_string(mut self, null_string: Option<String>) -> Self {
         self.null_string = null_string;
         self
     }
 
-    pub fn buffer(mut self, buffer: usize) -> Self {
-        self.buffer = buffer;
-        self
-    }
-
+    /// Sets whether the reader should be flexible with field counts.
+    #[must_use]
     pub fn flexible(mut self, flexible: bool) -> Self {
         self.flexible = flexible;
         self
     }
 
-    pub fn flexible_default(mut self, flexible_default: Option<&'a str>) -> Self {
+    /// Sets the default value for missing fields when in flexible mode.
+    #[must_use]
+    pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
         self.flexible_default = flexible_default;
         self
     }
 
+    /// Sets the trimming mode for fields.
+    #[must_use]
     pub fn trim(mut self, trim: csv::Trim) -> Self {
         self.trim = trim;
         self
     }
 
-    fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+    /// Handles reading from a file descriptor.
+    fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let raw_value = self.to_read.as_raw();
         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
+            .map_err(|e| ReaderError::FileDescriptor(format!("{:?}", e)))?;
 
         if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor);
+            return Err(ReaderError::InvalidFileDescriptor(fd));
         }
 
         let file = unsafe { File::from_raw_fd(fd) };
         let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
-        Ok(Box::new(BufReader::with_capacity(
-            READ_BUFFER_SIZE,
-            forgotten,
-        )))
+        Ok(Box::new(forgotten))
     }
 
-    fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+    /// Handles reading from a file path.
+    fn handle_file_path(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let path = self.to_read.to_r_string()?.to_string()?;
         let file = File::open(&path)?;
 
-        Ok(if path.ends_with(".gz") {
-            Box::new(GzDecoder::new(BufReader::with_capacity(
-                READ_BUFFER_SIZE,
-                file,
-            )))
+        if path.ends_with(".gz") {
+            // For gzipped files, we need to decompress them into memory first
+            // since GzDecoder doesn't support seeking
+            let mut decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file));
+            let mut contents = Vec::new();
+            decoder.read_to_end(&mut contents)?;
+            Ok(Box::new(std::io::Cursor::new(contents)))
         } else {
-            Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
-        })
+            Ok(Box::new(file))
+        }
     }
 
-    fn build_single_threaded(
-        self,
-        readable: BufReader<Box<dyn SeekableRead>>,
-    ) -> Result<RecordReader<'a, T>, ReaderError> {
+    /// Builds the RecordReader with the configured options.
+    pub fn build(self) -> Result<RecordReader<'a, T>, ReaderError> {
+        let readable = if self.to_read.is_kind_of(self.ruby.class_io()) {
+            self.handle_file_descriptor()?
+        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
+            self.handle_file_path()?
+        } else {
+            build_ruby_reader(&self.ruby, self.to_read)?
+        };
+
         let flexible = self.flexible || self.flexible_default.is_some();
+        let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
 
         let mut reader = csv::ReaderBuilder::new()
             .has_headers(self.has_headers)
@@ -205,16 +189,39 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
             .quote(self.quote_char)
             .flexible(flexible)
             .trim(self.trim)
-            .from_reader(readable);
+            .from_reader(reader);
 
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
         let static_headers = StringCache::intern_many(&headers)?;
 
-        Ok(RecordReader::new_single_threaded(
+        // We intern both of these to get static string references we can reuse throughout the parser.
+        let flexible_default = self
+            .flexible_default
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+
+        let null_string = self
+            .null_string
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+
+        Ok(RecordReader::new(
             reader,
             static_headers,
-            self.null_string,
-            self.flexible_default,
+            null_string,
+            flexible_default,
         ))
     }
 }
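
Taken together, these hunks replace the old `build_threaded` / `build_single_threaded` split with a single `build()` that routes every input through a `SeekableRead`, and the builder now owns its `Ruby` handle and a `String` flexible default instead of borrowing them. Below is a hedged sketch of how a caller might drive the revised API; `MyParser` (a `RecordParser` implementation) and the way the Ruby `Value` is obtained are assumptions made for illustration and are not part of this diff.

```
// Hedged usage sketch of the builder API shown in the hunks above.
// `MyParser` and the source of `input` are assumed, not taken from this diff.
fn open_reader(
    ruby: Ruby,
    input: Value,
) -> Result<RecordReader<'static, MyParser>, ReaderError> {
    RecordReaderBuilder::new(ruby, input)      // Ruby handle is now taken by value
        .has_headers(true)
        .delimiter(b',')
        .quote_char(b'"')
        .null_string(Some("NULL".to_string()))
        .flexible_default(Some(String::new())) // Option<String>, no longer Option<&'a str>
        .trim(csv::Trim::All)
        .build()                               // single entry point; the threaded variants are gone
}
```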
@@ -6,8 +6,14 @@
 /// so this optimization could be removed if any issues arise.
 use std::{
     collections::HashMap,
-    sync::{atomic::AtomicU32, LazyLock, Mutex},
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        LazyLock, Mutex, OnceLock,
+    },
 };
+
+use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
+
 use thiserror::Error;
 
 #[derive(Debug, Error)]
@@ -16,66 +22,139 @@ pub enum CacheError {
     LockError(String),
 }
 
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
 
 pub struct StringCache;
 
+#[derive(Copy, Clone)]
+pub struct StringCacheKey(Opaque<FString>, &'static str);
+
+impl StringCacheKey {
+    pub fn new(string: &str) -> Self {
+        let rstr = RString::new(string);
+        let fstr = rstr.to_interned_str();
+        Self(Opaque::from(fstr), fstr.as_str().unwrap())
+    }
+}
+
+impl AsRef<str> for StringCacheKey {
+    fn as_ref(&self) -> &'static str {
+        self.1
+    }
+}
+
+impl IntoValue for StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
+
+impl std::fmt::Debug for StringCacheKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.1.fmt(f)
+    }
+}
+
+impl PartialEq for StringCacheKey {
+    fn eq(&self, other: &Self) -> bool {
+        self.1 == other.1
+    }
+}
+
+impl std::cmp::Eq for StringCacheKey {}
+
+impl std::hash::Hash for StringCacheKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.1.hash(state);
+    }
+}
+
 impl StringCache {
     #[allow(dead_code)]
-    pub fn intern(string: String) -> Result<&'static str, CacheError> {
+    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
 
-        if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
-            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-            Ok(existing)
+        if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+            counter.fetch_add(1, Ordering::Relaxed);
+            Ok(*interned_string)
         } else {
+            let interned = StringCacheKey::new(string.as_str());
             let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, AtomicU32::new(1));
-            Ok(leaked)
+            cache.insert(leaked, (interned, AtomicU32::new(1)));
+            Ok(interned)
         }
     }
 
-    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
 
-        let mut result = Vec::with_capacity(strings.len());
+        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
         for string in strings {
-            if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
-                count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-                result.push(existing);
+            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+                counter.fetch_add(1, Ordering::Relaxed);
+                result.push(*interned_string);
             } else {
+                let interned = StringCacheKey::new(&string);
                 let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, AtomicU32::new(1));
-                result.push(leaked);
+                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                result.push(interned);
             }
         }
         Ok(result)
     }
 
-    pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
 
-        for header in headers {
-            if let Some(count) = cache.get(header) {
-                // Returns the previous value of the counter
-                let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
-                if was == 1 {
-                    cache.remove(header);
-                    let ptr = *header as *const str as *mut str;
-                    unsafe {
-                        let _ = Box::from_raw(ptr);
+        let to_remove: Vec<_> = headers
+            .iter()
+            .filter_map(|header| {
+                let key = header.as_ref();
+                if let Some((_, (_, counter))) = cache.get_key_value(key) {
+                    let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
+                    if prev_count == 1 {
+                        Some(key)
+                    } else {
+                        None
                     }
+                } else {
+                    None
                 }
-            }
+            })
+            .collect();
+
+        for key in to_remove {
+            cache.remove(key);
         }
 
         Ok(())
     }
 }
+
+pub struct HeaderCacheCleanupIter<I> {
+    pub inner: I,
+    pub headers: OnceLock<Vec<StringCacheKey>>,
+}
+
+impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+    type Item = I::Item;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next()
+    }
+}
+
+impl<I> Drop for HeaderCacheCleanupIter<I> {
+    fn drop(&mut self) {
+        if let Some(headers) = self.headers.get() {
+            StringCache::clear(&headers).unwrap();
+        }
+    }
+}
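
The cache now maps each leaked key string to a `StringCacheKey` (an interned Ruby `FString` plus a `&'static str` view) alongside its reference count, and the new `HeaderCacheCleanupIter` decrements those counts when the wrapping iterator is dropped. Below is a hedged sketch of the intended lifecycle, assuming a live Ruby VM (magnus requires one before `RString::new` can run) and a caller-supplied row iterator that is not shown in this diff.

```
// Sketch only: wiring together the pieces added above. Assumes magnus/Ruby are
// initialized and `rows` is some iterator over parsed records.
use std::sync::OnceLock;

fn wrap_rows<I: Iterator>(
    rows: I,
    headers: &[String],
) -> Result<HeaderCacheCleanupIter<I>, CacheError> {
    // intern_many bumps a per-string refcount and hands back reusable keys
    // backed by frozen Ruby strings.
    let keys = StringCache::intern_many(headers)?;

    let slot = OnceLock::new();
    let _ = slot.set(keys);

    // On drop, HeaderCacheCleanupIter calls StringCache::clear, which decrements
    // the refcounts and evicts entries that reach zero.
    Ok(HeaderCacheCleanupIter { inner: rows, headers: slot })
}
```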
@@ -7,7 +7,7 @@ mod ruby_integration;
 mod ruby_reader;
 
 pub use builder::RecordReaderBuilder;
-pub(crate) use builder::BUFFER_CHANNEL_SIZE;
-pub use record::CowValue;
+pub use header_cache::StringCacheKey;
+pub use record::CowStr;
 pub use record::CsvRecord;
 pub use ruby_integration::*;