osv 0.3.15 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +11 -1
- data/README.md +27 -27
- data/ext/osv/Cargo.toml +1 -0
- data/ext/osv/src/csv/builder.rs +92 -85
- data/ext/osv/src/csv/header_cache.rs +105 -26
- data/ext/osv/src/csv/mod.rs +2 -2
- data/ext/osv/src/csv/parser.rs +22 -85
- data/ext/osv/src/csv/record.rs +25 -8
- data/ext/osv/src/csv/record_reader.rs +53 -118
- data/ext/osv/src/csv/ruby_integration.rs +10 -21
- data/ext/osv/src/csv/ruby_reader.rs +9 -4
- data/ext/osv/src/reader.rs +64 -46
- data/ext/osv/src/utils.rs +4 -12
- data/lib/osv/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4469c67b2a39d9ffa23923e36cd894eac415ca004a432e700102a334af11efd8
|
4
|
+
data.tar.gz: 8dee3117fe6511b9c5b6005ae37d991891e0f314508986743b659080c7885855
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8c94dc1c576cca0043c7501752bdd6dee0c8bf0523d9c99a0e8ab4d614a0eb4e6f087fa62be97bb5816f9998f2c414758ffcab260e90889afada8379fb03aec
|
7
|
+
data.tar.gz: c51ece65a713af0b351a183415816302fcdc35ad598d0e5ee9e5b693c1ef66826c5dfc3dab90f04499c90e994e590e6dd7121999b5dfe54ce20997e41df0ac02
|
data/Cargo.lock
CHANGED
@@ -45,7 +45,7 @@ dependencies = [
|
|
45
45
|
"bitflags",
|
46
46
|
"cexpr",
|
47
47
|
"clang-sys",
|
48
|
-
"itertools",
|
48
|
+
"itertools 0.12.1",
|
49
49
|
"lazy_static",
|
50
50
|
"lazycell",
|
51
51
|
"proc-macro2",
|
@@ -175,6 +175,15 @@ dependencies = [
|
|
175
175
|
"either",
|
176
176
|
]
|
177
177
|
|
178
|
+
[[package]]
|
179
|
+
name = "itertools"
|
180
|
+
version = "0.14.0"
|
181
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
182
|
+
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
183
|
+
dependencies = [
|
184
|
+
"either",
|
185
|
+
]
|
186
|
+
|
178
187
|
[[package]]
|
179
188
|
name = "itoa"
|
180
189
|
version = "1.0.14"
|
@@ -347,6 +356,7 @@ dependencies = [
|
|
347
356
|
"ahash",
|
348
357
|
"csv",
|
349
358
|
"flate2",
|
359
|
+
"itertools 0.14.0",
|
350
360
|
"jemallocator",
|
351
361
|
"kanal",
|
352
362
|
"magnus 0.7.1",
|
data/README.md
CHANGED
@@ -121,7 +121,7 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
|
|
121
121
|
### 1,000,000 records
|
122
122
|
|
123
123
|
```
|
124
|
-
🏃
|
124
|
+
🏃 Running benchmarks...
|
125
125
|
Benchmarking with 3000001 lines of data
|
126
126
|
|
127
127
|
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24]
|
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
|
|
142
142
|
FastCSV - Gzipped 1.000 i/100ms
|
143
143
|
CSV - Gzipped 1.000 i/100ms
|
144
144
|
Calculating -------------------------------------
|
145
|
-
CSV - StringIO 0.
|
146
|
-
FastCSV - StringIO 0.
|
147
|
-
OSV - StringIO 0.
|
148
|
-
CSV - Hash output 0.
|
149
|
-
OSV - Hash output 0.
|
150
|
-
CSV - Array output 0.066 (± 0.0%) i/s (15.
|
151
|
-
OSV - Array output 0.
|
145
|
+
CSV - StringIO 0.080 (± 0.0%) i/s (12.43 s/i) - 3.000 in 37.301114s
|
146
|
+
FastCSV - StringIO 0.368 (± 0.0%) i/s (2.72 s/i) - 12.000 in 32.619020s
|
147
|
+
OSV - StringIO 0.699 (± 0.0%) i/s (1.43 s/i) - 21.000 in 30.091225s
|
148
|
+
CSV - Hash output 0.059 (± 0.0%) i/s (16.95 s/i) - 2.000 in 33.908533s
|
149
|
+
OSV - Hash output 0.329 (± 0.0%) i/s (3.04 s/i) - 10.000 in 30.551275s
|
150
|
+
CSV - Array output 0.066 (± 0.0%) i/s (15.18 s/i) - 2.000 in 30.357327s
|
151
|
+
OSV - Array output 0.632 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.150113s
|
152
152
|
FastCSV - Array output
|
153
|
-
0.
|
153
|
+
0.350 (± 0.0%) i/s (2.86 s/i) - 11.000 in 31.477268s
|
154
154
|
OSV - Direct Open Array output
|
155
|
-
0.
|
156
|
-
OSV - Gzipped 0.
|
157
|
-
OSV - Gzipped Direct 0.
|
158
|
-
FastCSV - Gzipped 0.
|
159
|
-
CSV - Gzipped 0.
|
155
|
+
0.641 (± 0.0%) i/s (1.56 s/i) - 20.000 in 31.275201s
|
156
|
+
OSV - Gzipped 0.530 (± 0.0%) i/s (1.89 s/i) - 16.000 in 30.183753s
|
157
|
+
OSV - Gzipped Direct 0.727 (± 0.0%) i/s (1.37 s/i) - 22.000 in 30.283991s
|
158
|
+
FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.949600s
|
159
|
+
CSV - Gzipped 0.056 (± 0.0%) i/s (17.72 s/i) - 2.000 in 35.440473s
|
160
160
|
|
161
161
|
Comparison:
|
162
|
-
OSV - Direct
|
163
|
-
OSV -
|
164
|
-
|
165
|
-
|
166
|
-
OSV - Gzipped: 0.5 i/s - 1.
|
167
|
-
FastCSV - StringIO: 0.4 i/s - 1.
|
168
|
-
FastCSV - Array output: 0.
|
169
|
-
|
170
|
-
|
171
|
-
CSV - StringIO: 0.1 i/s - 9.
|
172
|
-
CSV - Array output: 0.1 i/s -
|
173
|
-
CSV - Hash output: 0.1 i/s - 12.
|
174
|
-
CSV - Gzipped: 0.1 i/s - 12.
|
162
|
+
OSV - Gzipped Direct: 0.7 i/s
|
163
|
+
OSV - StringIO: 0.7 i/s - 1.04x slower
|
164
|
+
OSV - Direct Open Array output: 0.6 i/s - 1.14x slower
|
165
|
+
OSV - Array output: 0.6 i/s - 1.15x slower
|
166
|
+
OSV - Gzipped: 0.5 i/s - 1.37x slower
|
167
|
+
FastCSV - StringIO: 0.4 i/s - 1.98x slower
|
168
|
+
FastCSV - Array output: 0.3 i/s - 2.08x slower
|
169
|
+
OSV - Hash output: 0.3 i/s - 2.21x slower
|
170
|
+
FastCSV - Gzipped: 0.3 i/s - 2.25x slower
|
171
|
+
CSV - StringIO: 0.1 i/s - 9.04x slower
|
172
|
+
CSV - Array output: 0.1 i/s - 11.04x slower
|
173
|
+
CSV - Hash output: 0.1 i/s - 12.33x slower
|
174
|
+
CSV - Gzipped: 0.1 i/s - 12.89x slower
|
175
175
|
```
|
data/ext/osv/Cargo.toml
CHANGED
@@ -16,6 +16,7 @@ rb-sys = "^0.9"
|
|
16
16
|
serde = { version = "1.0", features = ["derive"] }
|
17
17
|
serde_magnus = "0.8.1"
|
18
18
|
thiserror = "2.0"
|
19
|
+
itertools = "^0.14"
|
19
20
|
|
20
21
|
[target.'cfg(target_os = "linux")'.dependencies]
|
21
22
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -6,8 +6,10 @@ use super::{
|
|
6
6
|
ForgottenFileHandle,
|
7
7
|
};
|
8
8
|
use flate2::read::GzDecoder;
|
9
|
-
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
|
9
|
+
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
|
10
10
|
use std::{
|
11
|
+
borrow::Cow,
|
12
|
+
fmt::Debug,
|
11
13
|
fs::File,
|
12
14
|
io::{self, BufReader, Read},
|
13
15
|
marker::PhantomData,
|
@@ -17,18 +19,21 @@ use std::{
|
|
17
19
|
|
18
20
|
use thiserror::Error;
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
+
/// Errors that can occur when building a RecordReader
|
22
23
|
#[derive(Error, Debug)]
|
23
24
|
pub enum ReaderError {
|
24
25
|
#[error("Failed to get file descriptor: {0}")]
|
25
26
|
FileDescriptor(String),
|
26
|
-
#[error("Invalid file descriptor")]
|
27
|
-
InvalidFileDescriptor,
|
27
|
+
#[error("Invalid file descriptor: {0}")]
|
28
|
+
InvalidFileDescriptor(i32),
|
28
29
|
#[error("Failed to open file: {0}")]
|
29
30
|
FileOpen(#[from] io::Error),
|
30
31
|
#[error("Failed to intern headers: {0}")]
|
31
32
|
HeaderIntern(#[from] CacheError),
|
33
|
+
#[error("Invalid flexible default value: {0}")]
|
34
|
+
InvalidFlexibleDefault(String),
|
35
|
+
#[error("Invalid null string value: {0}")]
|
36
|
+
InvalidNullString(String),
|
32
37
|
#[error("Ruby error: {0}")]
|
33
38
|
Ruby(String),
|
34
39
|
}
|
@@ -48,63 +53,27 @@ impl From<ReaderError> for MagnusError {
|
|
48
53
|
}
|
49
54
|
}
|
50
55
|
|
51
|
-
|
52
|
-
|
56
|
+
/// Builder for configuring and creating a RecordReader instance.
|
57
|
+
///
|
58
|
+
/// This struct provides a fluent interface for setting up CSV parsing options
|
59
|
+
/// and creating a RecordReader with the specified configuration.
|
60
|
+
pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
|
61
|
+
ruby: Ruby,
|
53
62
|
to_read: Value,
|
54
63
|
has_headers: bool,
|
55
64
|
delimiter: u8,
|
56
65
|
quote_char: u8,
|
57
66
|
null_string: Option<String>,
|
58
|
-
buffer: usize,
|
59
67
|
flexible: bool,
|
60
|
-
flexible_default: Option
|
68
|
+
flexible_default: Option<String>,
|
61
69
|
trim: csv::Trim,
|
62
70
|
_phantom: PhantomData<T>,
|
71
|
+
_phantom_a: PhantomData<&'a ()>,
|
63
72
|
}
|
64
73
|
|
65
|
-
impl<T: RecordParser<'
|
66
|
-
|
67
|
-
|
68
|
-
readable: Box<dyn Read + Send + 'static>,
|
69
|
-
) -> Result<RecordReader<'static, T>, ReaderError> {
|
70
|
-
let flexible = self.flexible || self.flexible_default.is_some();
|
71
|
-
let mut reader = csv::ReaderBuilder::new()
|
72
|
-
.has_headers(self.has_headers)
|
73
|
-
.delimiter(self.delimiter)
|
74
|
-
.quote(self.quote_char)
|
75
|
-
.flexible(flexible)
|
76
|
-
.trim(self.trim)
|
77
|
-
.from_reader(readable);
|
78
|
-
|
79
|
-
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
80
|
-
let static_headers = StringCache::intern_many(&headers)?;
|
81
|
-
|
82
|
-
Ok(RecordReader::new_multi_threaded(
|
83
|
-
reader,
|
84
|
-
static_headers,
|
85
|
-
self.buffer,
|
86
|
-
self.null_string,
|
87
|
-
self.flexible_default,
|
88
|
-
))
|
89
|
-
}
|
90
|
-
|
91
|
-
pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
|
92
|
-
if self.to_read.is_kind_of(self.ruby.class_io()) {
|
93
|
-
let readable = self.handle_file_descriptor()?;
|
94
|
-
self.build_multi_threaded(readable)
|
95
|
-
} else if self.to_read.is_kind_of(self.ruby.class_string()) {
|
96
|
-
let readable = self.handle_file_path()?;
|
97
|
-
self.build_multi_threaded(readable)
|
98
|
-
} else {
|
99
|
-
let readable = build_ruby_reader(self.ruby, self.to_read)?;
|
100
|
-
let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
|
101
|
-
self.build_single_threaded(buffered_reader)
|
102
|
-
}
|
103
|
-
}
|
104
|
-
}
|
105
|
-
|
106
|
-
impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
|
107
|
-
pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
|
74
|
+
impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
75
|
+
/// Creates a new builder instance with default settings.
|
76
|
+
pub fn new(ruby: Ruby, to_read: Value) -> Self {
|
108
77
|
Self {
|
109
78
|
ruby,
|
110
79
|
to_read,
|
@@ -112,92 +81,107 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
|
|
112
81
|
delimiter: b',',
|
113
82
|
quote_char: b'"',
|
114
83
|
null_string: None,
|
115
|
-
buffer: BUFFER_CHANNEL_SIZE,
|
116
84
|
flexible: false,
|
117
85
|
flexible_default: None,
|
118
86
|
trim: csv::Trim::None,
|
119
87
|
_phantom: PhantomData,
|
88
|
+
_phantom_a: PhantomData,
|
120
89
|
}
|
121
90
|
}
|
122
91
|
|
92
|
+
/// Sets whether the CSV file has headers.
|
93
|
+
#[must_use]
|
123
94
|
pub fn has_headers(mut self, has_headers: bool) -> Self {
|
124
95
|
self.has_headers = has_headers;
|
125
96
|
self
|
126
97
|
}
|
127
98
|
|
99
|
+
/// Sets the delimiter character for the CSV.
|
100
|
+
#[must_use]
|
128
101
|
pub fn delimiter(mut self, delimiter: u8) -> Self {
|
129
102
|
self.delimiter = delimiter;
|
130
103
|
self
|
131
104
|
}
|
132
105
|
|
106
|
+
/// Sets the quote character for the CSV.
|
107
|
+
#[must_use]
|
133
108
|
pub fn quote_char(mut self, quote_char: u8) -> Self {
|
134
109
|
self.quote_char = quote_char;
|
135
110
|
self
|
136
111
|
}
|
137
112
|
|
113
|
+
/// Sets the string that should be interpreted as null.
|
114
|
+
#[must_use]
|
138
115
|
pub fn null_string(mut self, null_string: Option<String>) -> Self {
|
139
116
|
self.null_string = null_string;
|
140
117
|
self
|
141
118
|
}
|
142
119
|
|
143
|
-
|
144
|
-
|
145
|
-
self
|
146
|
-
}
|
147
|
-
|
120
|
+
/// Sets whether the reader should be flexible with field counts.
|
121
|
+
#[must_use]
|
148
122
|
pub fn flexible(mut self, flexible: bool) -> Self {
|
149
123
|
self.flexible = flexible;
|
150
124
|
self
|
151
125
|
}
|
152
126
|
|
153
|
-
|
127
|
+
/// Sets the default value for missing fields when in flexible mode.
|
128
|
+
#[must_use]
|
129
|
+
pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
|
154
130
|
self.flexible_default = flexible_default;
|
155
131
|
self
|
156
132
|
}
|
157
133
|
|
134
|
+
/// Sets the trimming mode for fields.
|
135
|
+
#[must_use]
|
158
136
|
pub fn trim(mut self, trim: csv::Trim) -> Self {
|
159
137
|
self.trim = trim;
|
160
138
|
self
|
161
139
|
}
|
162
140
|
|
163
|
-
|
141
|
+
/// Handles reading from a file descriptor.
|
142
|
+
fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
164
143
|
let raw_value = self.to_read.as_raw();
|
165
144
|
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
166
|
-
.map_err(|
|
167
|
-
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
168
|
-
})?;
|
145
|
+
.map_err(|e| ReaderError::FileDescriptor(format!("{:?}", e)))?;
|
169
146
|
|
170
147
|
if fd < 0 {
|
171
|
-
return Err(ReaderError::InvalidFileDescriptor);
|
148
|
+
return Err(ReaderError::InvalidFileDescriptor(fd));
|
172
149
|
}
|
173
150
|
|
174
151
|
let file = unsafe { File::from_raw_fd(fd) };
|
175
152
|
let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
|
176
|
-
Ok(Box::new(
|
177
|
-
READ_BUFFER_SIZE,
|
178
|
-
forgotten,
|
179
|
-
)))
|
153
|
+
Ok(Box::new(forgotten))
|
180
154
|
}
|
181
155
|
|
182
|
-
|
156
|
+
/// Handles reading from a file path.
|
157
|
+
fn handle_file_path(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
183
158
|
let path = self.to_read.to_r_string()?.to_string()?;
|
184
159
|
let file = File::open(&path)?;
|
185
160
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
)
|
161
|
+
if path.ends_with(".gz") {
|
162
|
+
// For gzipped files, we need to decompress them into memory first
|
163
|
+
// since GzDecoder doesn't support seeking
|
164
|
+
let mut decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file));
|
165
|
+
let mut contents = Vec::new();
|
166
|
+
decoder.read_to_end(&mut contents)?;
|
167
|
+
Ok(Box::new(std::io::Cursor::new(contents)))
|
191
168
|
} else {
|
192
|
-
Box::new(
|
193
|
-
}
|
169
|
+
Ok(Box::new(file))
|
170
|
+
}
|
194
171
|
}
|
195
172
|
|
196
|
-
|
197
|
-
|
198
|
-
readable
|
199
|
-
|
173
|
+
/// Builds the RecordReader with the configured options.
|
174
|
+
pub fn build(self) -> Result<RecordReader<'a, T>, ReaderError> {
|
175
|
+
let readable = if self.to_read.is_kind_of(self.ruby.class_io()) {
|
176
|
+
self.handle_file_descriptor()?
|
177
|
+
} else if self.to_read.is_kind_of(self.ruby.class_string()) {
|
178
|
+
self.handle_file_path()?
|
179
|
+
} else {
|
180
|
+
build_ruby_reader(&self.ruby, self.to_read)?
|
181
|
+
};
|
182
|
+
|
200
183
|
let flexible = self.flexible || self.flexible_default.is_some();
|
184
|
+
let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
|
201
185
|
|
202
186
|
let mut reader = csv::ReaderBuilder::new()
|
203
187
|
.has_headers(self.has_headers)
|
@@ -205,16 +189,39 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
|
|
205
189
|
.quote(self.quote_char)
|
206
190
|
.flexible(flexible)
|
207
191
|
.trim(self.trim)
|
208
|
-
.from_reader(
|
192
|
+
.from_reader(reader);
|
209
193
|
|
210
|
-
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
194
|
+
let headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
|
211
195
|
let static_headers = StringCache::intern_many(&headers)?;
|
212
196
|
|
213
|
-
|
197
|
+
// We intern both of these to get static string references we can reuse throughout the parser.
|
198
|
+
let flexible_default = self
|
199
|
+
.flexible_default
|
200
|
+
.map(|s| {
|
201
|
+
RString::new(&s)
|
202
|
+
.to_interned_str()
|
203
|
+
.as_str()
|
204
|
+
.map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
|
205
|
+
})
|
206
|
+
.transpose()?
|
207
|
+
.map(|s| Cow::Borrowed(s));
|
208
|
+
|
209
|
+
let null_string = self
|
210
|
+
.null_string
|
211
|
+
.map(|s| {
|
212
|
+
RString::new(&s)
|
213
|
+
.to_interned_str()
|
214
|
+
.as_str()
|
215
|
+
.map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e)))
|
216
|
+
})
|
217
|
+
.transpose()?
|
218
|
+
.map(|s| Cow::Borrowed(s));
|
219
|
+
|
220
|
+
Ok(RecordReader::new(
|
214
221
|
reader,
|
215
222
|
static_headers,
|
216
|
-
|
217
|
-
|
223
|
+
null_string,
|
224
|
+
flexible_default,
|
218
225
|
))
|
219
226
|
}
|
220
227
|
}
|
@@ -6,8 +6,14 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
9
|
+
sync::{
|
10
|
+
atomic::{AtomicU32, Ordering},
|
11
|
+
LazyLock, Mutex, OnceLock,
|
12
|
+
},
|
10
13
|
};
|
14
|
+
|
15
|
+
use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
|
16
|
+
|
11
17
|
use thiserror::Error;
|
12
18
|
|
13
19
|
#[derive(Debug, Error)]
|
@@ -16,66 +22,139 @@ pub enum CacheError {
|
|
16
22
|
LockError(String),
|
17
23
|
}
|
18
24
|
|
19
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
20
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
21
27
|
|
22
28
|
pub struct StringCache;
|
23
29
|
|
30
|
+
#[derive(Copy, Clone)]
|
31
|
+
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
|
+
|
33
|
+
impl StringCacheKey {
|
34
|
+
pub fn new(string: &str) -> Self {
|
35
|
+
let rstr = RString::new(string);
|
36
|
+
let fstr = rstr.to_interned_str();
|
37
|
+
Self(Opaque::from(fstr), fstr.as_str().unwrap())
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
impl AsRef<str> for StringCacheKey {
|
42
|
+
fn as_ref(&self) -> &'static str {
|
43
|
+
self.1
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
impl IntoValue for StringCacheKey {
|
48
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
49
|
+
handle.into_value(self.0)
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
impl std::fmt::Debug for StringCacheKey {
|
54
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
|
+
self.1.fmt(f)
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
impl PartialEq for StringCacheKey {
|
60
|
+
fn eq(&self, other: &Self) -> bool {
|
61
|
+
self.1 == other.1
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
impl std::cmp::Eq for StringCacheKey {}
|
66
|
+
|
67
|
+
impl std::hash::Hash for StringCacheKey {
|
68
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
69
|
+
self.1.hash(state);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
24
73
|
impl StringCache {
|
25
74
|
#[allow(dead_code)]
|
26
|
-
pub fn intern(string: String) -> Result
|
75
|
+
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
27
76
|
let mut cache = STRING_CACHE
|
28
77
|
.lock()
|
29
78
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
30
79
|
|
31
|
-
if let Some((
|
32
|
-
|
33
|
-
Ok(
|
80
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
+
Ok(*interned_string)
|
34
83
|
} else {
|
84
|
+
let interned = StringCacheKey::new(string.as_str());
|
35
85
|
let leaked = Box::leak(string.into_boxed_str());
|
36
|
-
cache.insert(leaked, AtomicU32::new(1));
|
37
|
-
Ok(
|
86
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
87
|
+
Ok(interned)
|
38
88
|
}
|
39
89
|
}
|
40
90
|
|
41
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec
|
91
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
42
92
|
let mut cache = STRING_CACHE
|
43
93
|
.lock()
|
44
94
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
45
95
|
|
46
|
-
let mut result = Vec::with_capacity(strings.len());
|
96
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
47
97
|
for string in strings {
|
48
|
-
if let Some((
|
49
|
-
|
50
|
-
result.push(
|
98
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
+
result.push(*interned_string);
|
51
101
|
} else {
|
102
|
+
let interned = StringCacheKey::new(&string);
|
52
103
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
53
|
-
cache.insert(leaked, AtomicU32::new(1));
|
54
|
-
result.push(
|
104
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
105
|
+
result.push(interned);
|
55
106
|
}
|
56
107
|
}
|
57
108
|
Ok(result)
|
58
109
|
}
|
59
110
|
|
60
|
-
pub fn clear(headers: &[
|
111
|
+
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
61
112
|
let mut cache = STRING_CACHE
|
62
113
|
.lock()
|
63
114
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
64
115
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
let
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
116
|
+
let to_remove: Vec<_> = headers
|
117
|
+
.iter()
|
118
|
+
.filter_map(|header| {
|
119
|
+
let key = header.as_ref();
|
120
|
+
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
|
+
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
|
+
if prev_count == 1 {
|
123
|
+
Some(key)
|
124
|
+
} else {
|
125
|
+
None
|
74
126
|
}
|
127
|
+
} else {
|
128
|
+
None
|
75
129
|
}
|
76
|
-
}
|
130
|
+
})
|
131
|
+
.collect();
|
132
|
+
|
133
|
+
for key in to_remove {
|
134
|
+
cache.remove(key);
|
77
135
|
}
|
78
136
|
|
79
137
|
Ok(())
|
80
138
|
}
|
81
139
|
}
|
140
|
+
|
141
|
+
pub struct HeaderCacheCleanupIter<I> {
|
142
|
+
pub inner: I,
|
143
|
+
pub headers: OnceLock<Vec<StringCacheKey>>,
|
144
|
+
}
|
145
|
+
|
146
|
+
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
147
|
+
type Item = I::Item;
|
148
|
+
|
149
|
+
fn next(&mut self) -> Option<Self::Item> {
|
150
|
+
self.inner.next()
|
151
|
+
}
|
152
|
+
}
|
153
|
+
|
154
|
+
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
155
|
+
fn drop(&mut self) {
|
156
|
+
if let Some(headers) = self.headers.get() {
|
157
|
+
StringCache::clear(&headers).unwrap();
|
158
|
+
}
|
159
|
+
}
|
160
|
+
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
@@ -7,7 +7,7 @@ mod ruby_integration;
|
|
7
7
|
mod ruby_reader;
|
8
8
|
|
9
9
|
pub use builder::RecordReaderBuilder;
|
10
|
-
pub
|
11
|
-
pub use record::
|
10
|
+
pub use header_cache::StringCacheKey;
|
11
|
+
pub use record::CowStr;
|
12
12
|
pub use record::CsvRecord;
|
13
13
|
pub use ruby_integration::*;
|