osv 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,77 +2,78 @@ use std::borrow::Cow;
2
2
  use std::collections::HashMap;
3
3
  use std::hash::BuildHasher;
4
4
 
5
- use super::CowValue;
5
+ use super::header_cache::StringCacheKey;
6
+ use super::CowStr;
6
7
 
7
8
  pub trait RecordParser<'a> {
8
- type Output: 'a;
9
+ type Output;
9
10
 
10
11
  fn parse(
11
- headers: &[&'static str],
12
+ headers: &[StringCacheKey],
12
13
  record: &csv::StringRecord,
13
- null_string: Option<&str>,
14
+ null_string: Option<Cow<'a, str>>,
14
15
  flexible_default: Option<Cow<'a, str>>,
15
16
  ) -> Self::Output;
16
17
  }
17
18
 
18
- impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
19
- for HashMap<&'static str, Option<CowValue<'a>>, S>
19
+ impl<'a, S: BuildHasher + Default> RecordParser<'a>
20
+ for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
20
21
  {
21
22
  type Output = Self;
22
23
 
23
24
  #[inline]
24
25
  fn parse(
25
- headers: &[&'static str],
26
+ headers: &[StringCacheKey],
26
27
  record: &csv::StringRecord,
27
- null_string: Option<&str>,
28
+ null_string: Option<Cow<'a, str>>,
28
29
  flexible_default: Option<Cow<'a, str>>,
29
30
  ) -> Self::Output {
30
31
  let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
31
32
 
32
33
  let shared_empty = Cow::Borrowed("");
33
- let shared_default = flexible_default.map(CowValue);
34
- headers.iter().enumerate().for_each(|(i, &header)| {
34
+ let shared_default = flexible_default.map(CowStr);
35
+ headers.iter().enumerate().for_each(|(i, ref header)| {
35
36
  let value = record.get(i).map_or_else(
36
37
  || shared_default.clone(),
37
38
  |field| {
38
- if null_string == Some(field) {
39
+ if null_string.as_deref() == Some(field) {
39
40
  None
40
41
  } else if field.is_empty() {
41
- Some(CowValue(shared_empty.clone()))
42
+ Some(CowStr(shared_empty.clone()))
42
43
  } else {
43
- Some(CowValue(Cow::Owned(field.to_string())))
44
+ Some(CowStr(Cow::Owned(field.to_string())))
44
45
  }
45
46
  },
46
47
  );
47
- map.insert(header, value);
48
+ map.insert((*header).clone(), value);
48
49
  });
49
50
  map
50
51
  }
51
52
  }
52
53
 
53
- impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
54
+ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
54
55
  type Output = Self;
55
56
 
56
57
  #[inline]
57
58
  fn parse(
58
- headers: &[&'static str],
59
+ headers: &[StringCacheKey],
59
60
  record: &csv::StringRecord,
60
- null_string: Option<&str>,
61
+ null_string: Option<Cow<'a, str>>,
61
62
  flexible_default: Option<Cow<'a, str>>,
62
63
  ) -> Self::Output {
63
64
  let target_len = headers.len();
64
65
  let mut vec = Vec::with_capacity(target_len);
65
66
 
66
67
  let shared_empty = Cow::Borrowed("");
67
- let shared_default = flexible_default.map(CowValue);
68
+ let shared_default = flexible_default.map(CowStr);
68
69
 
69
70
  for field in record.iter() {
70
- let value = if Some(field) == null_string {
71
+ let value = if Some(field) == null_string.as_deref() {
71
72
  None
72
73
  } else if field.is_empty() {
73
- Some(CowValue(shared_empty.clone()))
74
+ Some(CowStr(shared_empty.clone()))
74
75
  } else {
75
- Some(CowValue(Cow::Owned(field.to_string())))
76
+ Some(CowStr(Cow::Owned(field.to_string())))
76
77
  };
77
78
  vec.push(value);
78
79
  }
@@ -85,67 +86,3 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
85
86
  vec
86
87
  }
87
88
  }
88
-
89
- // impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
90
- // for HashMap<&'static str, Option<String>, S>
91
- // {
92
- // type Output = Self;
93
-
94
- // #[inline]
95
- // fn parse(
96
- // headers: &[&'static str],
97
- // record: &csv::StringRecord,
98
- // null_string: Option<&str>,
99
- // flexible_default: Option<Cow<'a, str>>,
100
- // ) -> Self::Output {
101
- // let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
102
- // headers.iter().enumerate().for_each(|(i, &header)| {
103
- // let value = record.get(i).map_or_else(
104
- // || flexible_default.clone(),
105
- // |field| {
106
- // if null_string == Some(field) {
107
- // None
108
- // } else if field.is_empty() {
109
- // Some(String::new())
110
- // } else {
111
- // Some(field.into())
112
- // }
113
- // },
114
- // );
115
- // map.insert(header, value);
116
- // });
117
- // map
118
- // }
119
- // }
120
-
121
- // impl<'a> RecordParser<'a> for Vec<Option<String>> {
122
- // type Output = Self;
123
-
124
- // #[inline]
125
- // fn parse(
126
- // headers: &[&'static str],
127
- // record: &csv::StringRecord,
128
- // null_string: Option<&str>,
129
- // flexible_default: Option<Cow<'a, str>>,
130
- // ) -> Self::Output {
131
- // let target_len = headers.len();
132
- // let mut vec = Vec::with_capacity(target_len);
133
- // for field in record.iter() {
134
- // let value = if Some(field) == null_string {
135
- // None
136
- // } else if field.is_empty() {
137
- // Some(String::new())
138
- // } else {
139
- // Some(field.into())
140
- // };
141
- // vec.push(value);
142
- // }
143
-
144
- // if vec.len() < target_len {
145
- // if let Some(default) = flexible_default {
146
- // vec.resize_with(target_len, || Some(default.to_string()));
147
- // }
148
- // }
149
- // vec
150
- // }
151
- // }
@@ -1,10 +1,13 @@
1
- use magnus::{IntoValue, Ruby, Value};
1
+ use itertools::Itertools;
2
+ use magnus::{value::ReprValue, IntoValue, Ruby, Value};
2
3
  use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
3
4
 
5
+ use super::StringCacheKey;
6
+
4
7
  #[derive(Debug)]
5
8
  pub enum CsvRecord<'a, S: BuildHasher + Default> {
6
- Vec(Vec<Option<CowValue<'a>>>),
7
- Map(HashMap<&'static str, Option<CowValue<'a>>, S>),
9
+ Vec(Vec<Option<CowStr<'a>>>),
10
+ Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
8
11
  }
9
12
 
10
13
  impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -19,9 +22,23 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
19
22
  CsvRecord::Map(map) => {
20
23
  // Pre-allocate the hash with the known size
21
24
  let hash = handle.hash_new_capa(map.len());
22
- map.into_iter()
23
- .try_for_each(|(k, v)| hash.aset(k, v))
24
- .unwrap();
25
+
26
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
27
+ let mut i = 0;
28
+
29
+ for chunk in &map.into_iter().chunks(128) {
30
+ for (k, v) in chunk {
31
+ values[i] = handle.into_value(k);
32
+ values[i + 1] = handle.into_value(v);
33
+ i += 2;
34
+ }
35
+ hash.bulk_insert(&values[..i]).unwrap();
36
+
37
+ // Zero out used values
38
+ values[..i].fill(handle.qnil().as_value());
39
+ i = 0;
40
+ }
41
+
25
42
  hash.into_value_with(handle)
26
43
  }
27
44
  }
@@ -29,9 +46,9 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
29
46
  }
30
47
 
31
48
  #[derive(Debug, Clone)]
32
- pub struct CowValue<'a>(pub Cow<'a, str>);
49
+ pub struct CowStr<'a>(pub Cow<'a, str>);
33
50
 
34
- impl IntoValue for CowValue<'_> {
51
+ impl IntoValue for CowStr<'_> {
35
52
  fn into_value_with(self, handle: &Ruby) -> Value {
36
53
  self.0.into_value_with(handle)
37
54
  }
@@ -1,32 +1,35 @@
1
+ use super::header_cache::StringCacheKey;
1
2
  use super::parser::RecordParser;
2
3
  use super::{header_cache::StringCache, ruby_reader::SeekableRead};
3
4
  use magnus::{Error, Ruby};
4
- use std::io::BufReader;
5
- use std::{borrow::Cow, io::Read, thread};
5
+ use std::borrow::Cow;
6
+ use std::io::{BufReader, Read};
6
7
 
8
+ /// Size of the internal buffer used for reading CSV records
7
9
  pub(crate) const READ_BUFFER_SIZE: usize = 16384;
8
10
 
11
+ /// A reader that processes CSV records using a specified parser.
12
+ ///
13
+ /// This struct implements Iterator to provide a streaming interface for CSV records.
9
14
  pub struct RecordReader<'a, T: RecordParser<'a>> {
10
- inner: ReaderImpl<'a, T>,
11
- }
12
-
13
- #[allow(clippy::large_enum_variant)]
14
- enum ReaderImpl<'a, T: RecordParser<'a>> {
15
- SingleThreaded {
16
- reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
17
- headers: Vec<&'static str>,
18
- null_string: Option<String>,
19
- flexible_default: Option<Cow<'a, str>>,
20
- string_record: csv::StringRecord,
21
- },
22
- MultiThreaded {
23
- headers: Vec<&'static str>,
24
- receiver: kanal::Receiver<T::Output>,
25
- handle: Option<thread::JoinHandle<()>>,
26
- },
15
+ reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
16
+ headers: Vec<StringCacheKey>,
17
+ null_string: Option<Cow<'a, str>>,
18
+ flexible_default: Option<Cow<'a, str>>,
19
+ string_record: csv::StringRecord,
20
+ parser: std::marker::PhantomData<T>,
27
21
  }
28
22
 
29
23
  impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
24
+ /// Reads and processes headers from a CSV reader.
25
+ ///
26
+ /// # Arguments
27
+ /// * `ruby` - Ruby VM context for error handling
28
+ /// * `reader` - CSV reader instance
29
+ /// * `has_headers` - Whether the CSV file contains headers
30
+ ///
31
+ /// # Returns
32
+ /// A vector of header strings or generated column names if `has_headers` is false
30
33
  #[inline]
31
34
  pub(crate) fn get_headers(
32
35
  ruby: &Ruby,
@@ -40,67 +43,41 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
40
43
  )
41
44
  })?;
42
45
 
43
- let mut headers = Vec::with_capacity(first_row.len());
44
- if has_headers {
45
- headers.extend(first_row.iter().map(String::from));
46
+ Ok(if has_headers {
47
+ first_row.iter().map(String::from).collect()
46
48
  } else {
47
- headers.extend((0..first_row.len()).map(|i| format!("c{i}")));
48
- }
49
- Ok(headers)
49
+ (0..first_row.len()).map(|i| format!("c{i}")).collect()
50
+ })
50
51
  }
51
52
 
52
- pub(crate) fn new_single_threaded(
53
+ /// Creates a new RecordReader instance.
54
+ pub(crate) fn new(
53
55
  reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
54
- headers: Vec<&'static str>,
55
- null_string: Option<String>,
56
- flexible_default: Option<&'a str>,
56
+ headers: Vec<StringCacheKey>,
57
+ null_string: Option<Cow<'a, str>>,
58
+ flexible_default: Option<Cow<'a, str>>,
57
59
  ) -> Self {
58
60
  let headers_len = headers.len();
59
61
  Self {
60
- inner: ReaderImpl::SingleThreaded {
61
- reader,
62
- headers,
63
- null_string,
64
- flexible_default: flexible_default.map(Cow::Borrowed),
65
- string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
66
- },
62
+ reader,
63
+ headers,
64
+ null_string,
65
+ flexible_default,
66
+ string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
67
+ parser: std::marker::PhantomData,
67
68
  }
68
69
  }
69
- }
70
70
 
71
- impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
72
- pub(crate) fn new_multi_threaded(
73
- mut reader: csv::Reader<Box<dyn Read + Send + 'static>>,
74
- headers: Vec<&'static str>,
75
- buffer_size: usize,
76
- null_string: Option<String>,
77
- flexible_default: Option<&'static str>,
78
- ) -> Self {
79
- let (sender, receiver) = kanal::bounded(buffer_size);
80
- let headers_for_thread = headers.clone();
81
-
82
- let handle = thread::spawn(move || {
83
- let mut record =
84
- csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_for_thread.len());
85
- while let Ok(true) = reader.read_record(&mut record) {
86
- let row = T::parse(
87
- &headers_for_thread,
88
- &record,
89
- null_string.as_deref(),
90
- flexible_default.map(Cow::Borrowed),
91
- );
92
- if sender.send(row).is_err() {
93
- break;
94
- }
95
- }
96
- });
97
-
98
- Self {
99
- inner: ReaderImpl::MultiThreaded {
100
- headers,
101
- receiver,
102
- handle: Some(handle),
103
- },
71
+ /// Attempts to read the next record, returning any errors encountered.
72
+ fn try_next(&mut self) -> csv::Result<Option<T::Output>> {
73
+ match self.reader.read_record(&mut self.string_record)? {
74
+ true => Ok(Some(T::parse(
75
+ &self.headers,
76
+ &self.string_record,
77
+ self.null_string.clone(),
78
+ self.flexible_default.clone(),
79
+ ))),
80
+ false => Ok(None),
104
81
  }
105
82
  }
106
83
  }
@@ -110,63 +87,21 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
110
87
 
111
88
  #[inline]
112
89
  fn next(&mut self) -> Option<Self::Item> {
113
- match &mut self.inner {
114
- ReaderImpl::MultiThreaded {
115
- receiver, handle, ..
116
- } => match receiver.recv() {
117
- Ok(record) => Some(record),
118
- Err(_) => {
119
- if let Some(handle) = handle.take() {
120
- let _ = handle.join();
121
- }
122
- None
123
- }
124
- },
125
- ReaderImpl::SingleThreaded {
126
- reader,
127
- headers,
128
- null_string,
129
- flexible_default,
130
- ref mut string_record,
131
- } => match reader.read_record(string_record) {
132
- Ok(true) => Some(T::parse(
133
- headers,
134
- string_record,
135
- null_string.as_deref(),
136
- flexible_default.clone(),
137
- )),
138
- Ok(false) => None,
139
- Err(_e) => None,
140
- },
141
- }
90
+ // Note: We intentionally swallow errors here to maintain Iterator contract.
91
+ // Errors can be handled by using try_next() directly if needed.
92
+ self.try_next().ok().flatten()
142
93
  }
143
94
 
144
95
  #[inline]
145
96
  fn size_hint(&self) -> (usize, Option<usize>) {
146
- // We can't know the exact size without reading the whole file
147
- (0, None)
97
+ (0, None) // Cannot determine size without reading entire file
148
98
  }
149
99
  }
150
100
 
151
101
  impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
152
102
  #[inline]
153
103
  fn drop(&mut self) {
154
- match &mut self.inner {
155
- ReaderImpl::MultiThreaded {
156
- receiver,
157
- handle,
158
- headers,
159
- ..
160
- } => {
161
- receiver.close();
162
- if let Some(handle) = handle.take() {
163
- let _ = handle.join();
164
- }
165
- let _ = StringCache::clear(headers);
166
- }
167
- ReaderImpl::SingleThreaded { headers, .. } => {
168
- let _ = StringCache::clear(headers);
169
- }
170
- }
104
+ // Intentionally ignore errors during cleanup as there's no meaningful way to handle them
105
+ let _ = StringCache::clear(&self.headers);
171
106
  }
172
107
  }
@@ -1,30 +1,19 @@
1
- use std::{fs::File, io, mem::ManuallyDrop};
1
+ use std::{
2
+ fs::File,
3
+ io::{self, Read, Seek, SeekFrom},
4
+ mem::ManuallyDrop,
5
+ };
2
6
 
3
7
  pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
4
8
 
5
- impl std::io::Read for ForgottenFileHandle {
9
+ impl Read for ForgottenFileHandle {
6
10
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
7
11
  self.0.read(buf)
8
12
  }
13
+ }
9
14
 
10
- fn read_vectored(&mut self, bufs: &mut [std::io::IoSliceMut<'_>]) -> io::Result<usize> {
11
- self.0.read_vectored(bufs)
12
- }
13
-
14
- // fn read_buf(&mut self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
15
- // self.0.read_buf(cursor)
16
- // }
17
-
18
- // #[inline]
19
- // fn is_read_vectored(&self) -> bool {
20
- // self.0.is_read_vectored()
21
- // }
22
-
23
- fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
24
- self.0.read_to_end(buf)
25
- }
26
-
27
- fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
28
- self.0.read_to_string(buf)
15
+ impl Seek for ForgottenFileHandle {
16
+ fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
17
+ self.0.seek(pos)
29
18
  }
30
19
  }
@@ -2,9 +2,12 @@ use magnus::{
2
2
  value::{Opaque, ReprValue},
3
3
  RClass, RString, Ruby, Value,
4
4
  };
5
- use std::io::{self, Read, Seek, SeekFrom, Write};
5
+ use std::fs::File;
6
+ use std::io::{self, BufReader, Read, Seek, SeekFrom, Write};
6
7
  use std::sync::OnceLock;
7
8
 
9
+ use super::ForgottenFileHandle;
10
+
8
11
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
12
 
10
13
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
@@ -17,6 +20,10 @@ pub struct RubyReader<T> {
17
20
  pub trait SeekableRead: std::io::Read + Seek {}
18
21
  impl SeekableRead for RubyReader<Value> {}
19
22
  impl SeekableRead for RubyReader<RString> {}
23
+ impl SeekableRead for File {}
24
+ impl<T: Read + Seek> SeekableRead for BufReader<T> {}
25
+ impl SeekableRead for std::io::Cursor<Vec<u8>> {}
26
+ impl SeekableRead for ForgottenFileHandle {}
20
27
 
21
28
  pub fn build_ruby_reader(
22
29
  ruby: &Ruby,
@@ -74,9 +81,7 @@ impl Seek for RubyReader<RString> {
74
81
  match pos {
75
82
  io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
83
  io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
- io::SeekFrom::End(offset) => {
78
- self.offset = self.inner.len() - offset as usize
79
- }
84
+ io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
80
85
  }
81
86
  Ok(self.offset as u64)
82
87
  }