osv 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,19 +1,20 @@
1
1
  use super::{
2
2
  header_cache::{CacheError, StringCache},
3
3
  parser::RecordParser,
4
- read_impl::ReadImpl,
5
- reader::RecordReader,
6
- READ_BUFFER_SIZE,
4
+ record_reader::{RecordReader, READ_BUFFER_SIZE},
5
+ ruby_reader::{build_ruby_reader, SeekableRead},
6
+ ForgottenFileHandle,
7
7
  };
8
8
  use flate2::read::GzDecoder;
9
- use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
9
+ use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
10
10
  use std::{
11
11
  fs::File,
12
12
  io::{self, BufReader, Read},
13
13
  marker::PhantomData,
14
+ mem::ManuallyDrop,
14
15
  os::fd::FromRawFd,
15
- thread,
16
16
  };
17
+
17
18
  use thiserror::Error;
18
19
 
19
20
  pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
@@ -28,8 +29,6 @@ pub enum ReaderError {
28
29
  FileOpen(#[from] io::Error),
29
30
  #[error("Failed to intern headers: {0}")]
30
31
  HeaderIntern(#[from] CacheError),
31
- #[error("Unsupported GzipReader")]
32
- UnsupportedGzipReader,
33
32
  #[error("Ruby error: {0}")]
34
33
  Ruby(String),
35
34
  }
@@ -49,7 +48,7 @@ impl From<ReaderError> for MagnusError {
49
48
  }
50
49
  }
51
50
 
52
- pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
51
+ pub struct RecordReaderBuilder<'a, T: RecordParser<'a> + Send> {
53
52
  ruby: &'a Ruby,
54
53
  to_read: Value,
55
54
  has_headers: bool,
@@ -58,12 +57,53 @@ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
58
57
  null_string: Option<String>,
59
58
  buffer: usize,
60
59
  flexible: bool,
61
- flexible_default: Option<String>,
60
+ flexible_default: Option<&'a str>,
62
61
  trim: csv::Trim,
63
62
  _phantom: PhantomData<T>,
64
63
  }
65
64
 
66
- impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
65
+ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T> {
66
+ fn build_multi_threaded(
67
+ self,
68
+ readable: Box<dyn Read + Send + 'static>,
69
+ ) -> Result<RecordReader<'static, T>, ReaderError> {
70
+ let flexible = self.flexible || self.flexible_default.is_some();
71
+ let mut reader = csv::ReaderBuilder::new()
72
+ .has_headers(self.has_headers)
73
+ .delimiter(self.delimiter)
74
+ .quote(self.quote_char)
75
+ .flexible(flexible)
76
+ .trim(self.trim)
77
+ .from_reader(readable);
78
+
79
+ let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
80
+ let static_headers = StringCache::intern_many(&headers)?;
81
+
82
+ Ok(RecordReader::new_multi_threaded(
83
+ reader,
84
+ static_headers,
85
+ self.buffer,
86
+ self.null_string,
87
+ self.flexible_default,
88
+ ))
89
+ }
90
+
91
+ pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
92
+ if self.to_read.is_kind_of(self.ruby.class_io()) {
93
+ let readable = self.handle_file_descriptor()?;
94
+ self.build_multi_threaded(readable)
95
+ } else if self.to_read.is_kind_of(self.ruby.class_string()) {
96
+ let readable = self.handle_file_path()?;
97
+ self.build_multi_threaded(readable)
98
+ } else {
99
+ let readable = build_ruby_reader(self.ruby, self.to_read)?;
100
+ let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
101
+ self.build_single_threaded(buffered_reader)
102
+ }
103
+ }
104
+ }
105
+
106
+ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
67
107
  pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
68
108
  Self {
69
109
  ruby,
@@ -110,7 +150,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
110
150
  self
111
151
  }
112
152
 
113
- pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
153
+ pub fn flexible_default(mut self, flexible_default: Option<&'a str>) -> Self {
114
154
  self.flexible_default = flexible_default;
115
155
  self
116
156
  }
@@ -120,12 +160,6 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
120
160
  self
121
161
  }
122
162
 
123
- fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
124
- let string: RString = self.to_read.funcall("string", ())?;
125
- let content = string.to_string()?;
126
- Ok(Box::new(std::io::Cursor::new(content)))
127
- }
128
-
129
163
  fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
130
164
  let raw_value = self.to_read.as_raw();
131
165
  let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
@@ -138,7 +172,11 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
138
172
  }
139
173
 
140
174
  let file = unsafe { File::from_raw_fd(fd) };
141
- Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
175
+ let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
176
+ Ok(Box::new(BufReader::with_capacity(
177
+ READ_BUFFER_SIZE,
178
+ forgotten,
179
+ )))
142
180
  }
143
181
 
144
182
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -155,102 +193,12 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
155
193
  })
156
194
  }
157
195
 
158
- fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
159
- let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
160
- let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
161
-
162
- if self.to_read.is_kind_of(string_io) {
163
- self.handle_string_io().map(|r| (r, false))
164
- } else if self.to_read.is_kind_of(gzip_reader_class) {
165
- Err(ReaderError::UnsupportedGzipReader)
166
- } else if self.to_read.is_kind_of(self.ruby.class_io()) {
167
- self.handle_file_descriptor().map(|r| (r, true))
168
- } else {
169
- self.handle_file_path().map(|r| (r, false))
170
- }
171
- }
172
-
173
- fn get_single_threaded_reader(&self) -> Result<Box<dyn Read>, ReaderError> {
174
- let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
175
- let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
176
-
177
- if self.to_read.is_kind_of(string_io) {
178
- self.handle_string_io().map(|r| -> Box<dyn Read> { r })
179
- } else if self.to_read.is_kind_of(gzip_reader_class) {
180
- Ok(Box::new(RubyReader::new(self.to_read)))
181
- } else if self.to_read.is_kind_of(self.ruby.class_io()) {
182
- self.handle_file_descriptor()
183
- .map(|r| -> Box<dyn Read> { r })
184
- } else {
185
- self.handle_file_path().map(|r| -> Box<dyn Read> { r })
186
- }
187
- }
188
-
189
- pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
190
- match self.get_reader() {
191
- Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
192
- Err(_) => {
193
- let readable = self.get_single_threaded_reader()?;
194
- self.build_single_threaded(readable)
195
- }
196
- }
197
- }
198
-
199
- fn build_multi_threaded(
200
- self,
201
- readable: Box<dyn Read + Send + 'static>,
202
- should_forget: bool,
203
- ) -> Result<RecordReader<T>, ReaderError> {
204
- let flexible = self.flexible || self.flexible_default.is_some();
205
- let mut reader = csv::ReaderBuilder::new()
206
- .has_headers(self.has_headers)
207
- .delimiter(self.delimiter)
208
- .quote(self.quote_char)
209
- .flexible(flexible)
210
- .trim(self.trim)
211
- .from_reader(readable);
212
-
213
- let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
214
- let static_headers = StringCache::intern_many(&headers)?;
215
- let headers_for_cleanup = static_headers.clone();
216
-
217
- let (sender, receiver) = kanal::bounded(self.buffer);
218
- let null_string = self.null_string.clone();
219
-
220
- let flexible_default = self.flexible_default.clone();
221
- let handle = thread::spawn(move || {
222
- let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
223
- while let Ok(true) = reader.read_record(&mut record) {
224
- let row = T::parse(
225
- &static_headers,
226
- &record,
227
- null_string.as_deref(),
228
- flexible_default.as_deref(),
229
- );
230
- if sender.send(row).is_err() {
231
- break;
232
- }
233
- }
234
- if should_forget {
235
- let file_to_forget = reader.into_inner();
236
- std::mem::forget(file_to_forget);
237
- }
238
- });
239
-
240
- Ok(RecordReader {
241
- reader: ReadImpl::MultiThreaded {
242
- headers: headers_for_cleanup,
243
- receiver,
244
- handle: Some(handle),
245
- },
246
- })
247
- }
248
-
249
196
  fn build_single_threaded(
250
197
  self,
251
- readable: Box<dyn Read>,
252
- ) -> Result<RecordReader<T>, ReaderError> {
198
+ readable: BufReader<Box<dyn SeekableRead>>,
199
+ ) -> Result<RecordReader<'a, T>, ReaderError> {
253
200
  let flexible = self.flexible || self.flexible_default.is_some();
201
+
254
202
  let mut reader = csv::ReaderBuilder::new()
255
203
  .has_headers(self.has_headers)
256
204
  .delimiter(self.delimiter)
@@ -262,70 +210,11 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
262
210
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
263
211
  let static_headers = StringCache::intern_many(&headers)?;
264
212
 
265
- Ok(RecordReader {
266
- reader: ReadImpl::SingleThreaded {
267
- reader,
268
- headers: static_headers,
269
- null_string: self.null_string,
270
- flexible_default: self.flexible_default,
271
- },
272
- })
273
- }
274
- }
275
-
276
- struct RubyReader {
277
- inner: Value,
278
- buffer: Option<Vec<u8>>,
279
- offset: usize,
280
- }
281
-
282
- impl RubyReader {
283
- fn new(inner: Value) -> Self {
284
- Self {
285
- inner,
286
- buffer: None,
287
- offset: 0,
288
- }
289
- }
290
- }
291
-
292
- // Read the entire inner into a vector and then read future reads from that vector with offset
293
- impl Read for RubyReader {
294
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
295
- // If we have an existing buffer, read from it
296
- if let Some(buffer) = self.buffer.as_ref() {
297
- let remaining = buffer.len() - self.offset;
298
- let copy_size = remaining.min(buf.len());
299
- buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
300
- self.offset += copy_size;
301
- return Ok(copy_size);
302
- }
303
-
304
- // No buffer yet - read the entire content from Ruby
305
- let result = self.inner.funcall::<_, _, Value>("read", ());
306
- match result {
307
- Ok(data) => {
308
- if data.is_nil() {
309
- return Ok(0); // EOF
310
- }
311
-
312
- let string = RString::from_value(data).ok_or_else(|| {
313
- io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
314
- })?;
315
- let bytes = unsafe { string.as_slice() };
316
-
317
- // Store the entire content in the buffer
318
- self.buffer = Some(bytes.to_vec());
319
- self.offset = 0;
320
-
321
- // Read initial chunk
322
- let copy_size = bytes.len().min(buf.len());
323
- buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
324
- self.offset = copy_size;
325
-
326
- Ok(copy_size)
327
- }
328
- Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
329
- }
213
+ Ok(RecordReader::new_single_threaded(
214
+ reader,
215
+ static_headers,
216
+ self.null_string,
217
+ self.flexible_default,
218
+ ))
330
219
  }
331
220
  }
@@ -1,11 +1,13 @@
1
1
  mod builder;
2
2
  mod header_cache;
3
3
  mod parser;
4
- pub mod read_impl;
5
- mod reader;
6
4
  mod record;
5
+ mod record_reader;
6
+ mod ruby_integration;
7
+ mod ruby_reader;
7
8
 
8
9
  pub use builder::RecordReaderBuilder;
9
10
  pub(crate) use builder::BUFFER_CHANNEL_SIZE;
10
- pub(crate) use read_impl::READ_BUFFER_SIZE;
11
+ pub use record::CowValue;
11
12
  pub use record::CsvRecord;
13
+ pub use ruby_integration::*;
@@ -1,18 +1,23 @@
1
+ use std::borrow::Cow;
1
2
  use std::collections::HashMap;
2
3
  use std::hash::BuildHasher;
3
4
 
4
- pub trait RecordParser {
5
- type Output;
5
+ use super::CowValue;
6
+
7
+ pub trait RecordParser<'a> {
8
+ type Output: 'a;
6
9
 
7
10
  fn parse(
8
11
  headers: &[&'static str],
9
12
  record: &csv::StringRecord,
10
13
  null_string: Option<&str>,
11
- flexible_default: Option<&str>,
14
+ flexible_default: Option<Cow<'a, str>>,
12
15
  ) -> Self::Output;
13
16
  }
14
17
 
15
- impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
18
+ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
19
+ for HashMap<&'static str, Option<CowValue<'a>>, S>
20
+ {
16
21
  type Output = Self;
17
22
 
18
23
  #[inline]
@@ -20,19 +25,22 @@ impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<Str
20
25
  headers: &[&'static str],
21
26
  record: &csv::StringRecord,
22
27
  null_string: Option<&str>,
23
- flexible_default: Option<&str>,
28
+ flexible_default: Option<Cow<'a, str>>,
24
29
  ) -> Self::Output {
25
30
  let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
31
+
32
+ let shared_empty = Cow::Borrowed("");
33
+ let shared_default = flexible_default.map(CowValue);
26
34
  headers.iter().enumerate().for_each(|(i, &header)| {
27
35
  let value = record.get(i).map_or_else(
28
- || flexible_default.map(ToString::to_string),
36
+ || shared_default.clone(),
29
37
  |field| {
30
38
  if null_string == Some(field) {
31
39
  None
32
40
  } else if field.is_empty() {
33
- Some(String::new())
41
+ Some(CowValue(shared_empty.clone()))
34
42
  } else {
35
- Some(field.into())
43
+ Some(CowValue(Cow::Owned(field.to_string())))
36
44
  }
37
45
  },
38
46
  );
@@ -42,7 +50,7 @@ impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<Str
42
50
  }
43
51
  }
44
52
 
45
- impl RecordParser for Vec<Option<String>> {
53
+ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
46
54
  type Output = Self;
47
55
 
48
56
  #[inline]
@@ -50,26 +58,94 @@ impl RecordParser for Vec<Option<String>> {
50
58
  headers: &[&'static str],
51
59
  record: &csv::StringRecord,
52
60
  null_string: Option<&str>,
53
- flexible_default: Option<&str>,
61
+ flexible_default: Option<Cow<'a, str>>,
54
62
  ) -> Self::Output {
55
63
  let target_len = headers.len();
56
64
  let mut vec = Vec::with_capacity(target_len);
65
+
66
+ let shared_empty = Cow::Borrowed("");
67
+ let shared_default = flexible_default.map(CowValue);
68
+
57
69
  for field in record.iter() {
58
70
  let value = if Some(field) == null_string {
59
71
  None
60
72
  } else if field.is_empty() {
61
- Some(String::new())
73
+ Some(CowValue(shared_empty.clone()))
62
74
  } else {
63
- Some(field.into())
75
+ Some(CowValue(Cow::Owned(field.to_string())))
64
76
  };
65
77
  vec.push(value);
66
78
  }
67
79
 
68
80
  if vec.len() < target_len {
69
- if let Some(default) = flexible_default {
70
- vec.resize_with(target_len, || Some(default.to_string()));
81
+ if let Some(default) = shared_default {
82
+ vec.resize_with(target_len, || Some(default.clone()));
71
83
  }
72
84
  }
73
85
  vec
74
86
  }
75
87
  }
88
+
89
+ // impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
90
+ // for HashMap<&'static str, Option<String>, S>
91
+ // {
92
+ // type Output = Self;
93
+
94
+ // #[inline]
95
+ // fn parse(
96
+ // headers: &[&'static str],
97
+ // record: &csv::StringRecord,
98
+ // null_string: Option<&str>,
99
+ // flexible_default: Option<Cow<'a, str>>,
100
+ // ) -> Self::Output {
101
+ // let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
102
+ // headers.iter().enumerate().for_each(|(i, &header)| {
103
+ // let value = record.get(i).map_or_else(
104
+ // || flexible_default.clone(),
105
+ // |field| {
106
+ // if null_string == Some(field) {
107
+ // None
108
+ // } else if field.is_empty() {
109
+ // Some(String::new())
110
+ // } else {
111
+ // Some(field.into())
112
+ // }
113
+ // },
114
+ // );
115
+ // map.insert(header, value);
116
+ // });
117
+ // map
118
+ // }
119
+ // }
120
+
121
+ // impl<'a> RecordParser<'a> for Vec<Option<String>> {
122
+ // type Output = Self;
123
+
124
+ // #[inline]
125
+ // fn parse(
126
+ // headers: &[&'static str],
127
+ // record: &csv::StringRecord,
128
+ // null_string: Option<&str>,
129
+ // flexible_default: Option<Cow<'a, str>>,
130
+ // ) -> Self::Output {
131
+ // let target_len = headers.len();
132
+ // let mut vec = Vec::with_capacity(target_len);
133
+ // for field in record.iter() {
134
+ // let value = if Some(field) == null_string {
135
+ // None
136
+ // } else if field.is_empty() {
137
+ // Some(String::new())
138
+ // } else {
139
+ // Some(field.into())
140
+ // };
141
+ // vec.push(value);
142
+ // }
143
+
144
+ // if vec.len() < target_len {
145
+ // if let Some(default) = flexible_default {
146
+ // vec.resize_with(target_len, || Some(default.to_string()));
147
+ // }
148
+ // }
149
+ // vec
150
+ // }
151
+ // }
@@ -1,17 +1,21 @@
1
1
  use magnus::{IntoValue, Ruby, Value};
2
- use std::{collections::HashMap, hash::BuildHasher};
2
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
3
3
 
4
4
  #[derive(Debug)]
5
- pub enum CsvRecord<S: BuildHasher + Default> {
6
- Vec(Vec<Option<String>>),
7
- Map(HashMap<&'static str, Option<String>, S>),
5
+ pub enum CsvRecord<'a, S: BuildHasher + Default> {
6
+ Vec(Vec<Option<CowValue<'a>>>),
7
+ Map(HashMap<&'static str, Option<CowValue<'a>>, S>),
8
8
  }
9
9
 
10
- impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
10
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
11
11
  #[inline]
12
12
  fn into_value_with(self, handle: &Ruby) -> Value {
13
13
  match self {
14
- CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
+ CsvRecord::Vec(vec) => {
15
+ let ary = handle.ary_new_capa(vec.len());
16
+ vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
17
+ ary.into_value_with(handle)
18
+ }
15
19
  CsvRecord::Map(map) => {
16
20
  // Pre-allocate the hash with the known size
17
21
  let hash = handle.hash_new_capa(map.len());
@@ -23,3 +27,12 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
23
27
  }
24
28
  }
25
29
  }
30
+
31
+ #[derive(Debug, Clone)]
32
+ pub struct CowValue<'a>(pub Cow<'a, str>);
33
+
34
+ impl IntoValue for CowValue<'_> {
35
+ fn into_value_with(self, handle: &Ruby) -> Value {
36
+ self.0.into_value_with(handle)
37
+ }
38
+ }