osv 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 51e4a387f1ed43bddc9f1f7a118637953d04239b5324ef131b9c860577ed4d41
4
- data.tar.gz: e42928a09656216bbadcc2458953a8c5f28401ddf27095fc05038e0960471854
3
+ metadata.gz: 137ae556685639f7d13234e3061d9b310757ce02f75a713753d175f1bc71b628
4
+ data.tar.gz: 5892494ad08d783955d2b932150d65433a4d3593376fadbaf54e54780e7a350f
5
5
  SHA512:
6
- metadata.gz: 4100c50a629ba5803db883532cfbe547eb3091e421b0876595d91791d8952a7b0169477c9c6f31063eafa5b91d0a9b1a9f0a5ae016d70cdd101e284beebfaf22
7
- data.tar.gz: 90a822c644fcb37dc1892ede85a54395bc9e62a4b0b0a1af838182d390702d0ee4253151faafcedbf734b0a381fe2acf5c1ab23b842059fbdd4d51570fe33e58
6
+ metadata.gz: 6efbc2ee65a8e79379722ae977ee7dbec6131b78968d080f9feb86a3310368c387da54dd8c073e9b4008cb80d906293ea9115982d00d5ff637cf5ab51179b53c
7
+ data.tar.gz: 7b4ab3199f90654cd831dfbb52a9d22b70237e7120bd5308a1b7698268fa981abefd7ee47d53424d0c7bff46956256db8f1e139d17e381fd5570a16ca183e376
@@ -213,7 +213,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
213
213
  .from_reader(reader);
214
214
 
215
215
  let mut headers =
216
- RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
216
+ RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers, self.lossy)?;
217
+
217
218
  if self.ignore_null_bytes {
218
219
  headers = headers.iter().map(|h| h.replace("\0", "")).collect();
219
220
  }
@@ -8,7 +8,7 @@ use std::{
8
8
  collections::HashMap,
9
9
  sync::{
10
10
  atomic::{AtomicU32, Ordering},
11
- LazyLock, Mutex, OnceLock,
11
+ Arc, LazyLock, Mutex, OnceLock,
12
12
  },
13
13
  };
14
14
 
@@ -22,12 +22,11 @@ pub enum CacheError {
22
22
  LockError(String),
23
23
  }
24
24
 
25
- static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
25
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (Arc<StringCacheKey>, AtomicU32)>>> =
26
26
  LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
27
27
 
28
28
  pub struct StringCache;
29
29
 
30
- #[derive(Copy, Clone)]
31
30
  pub struct StringCacheKey(Opaque<FString>, &'static str);
32
31
 
33
32
  impl StringCacheKey {
@@ -50,6 +49,12 @@ impl IntoValue for StringCacheKey {
50
49
  }
51
50
  }
52
51
 
52
+ impl IntoValue for &StringCacheKey {
53
+ fn into_value_with(self, handle: &Ruby) -> Value {
54
+ handle.into_value(self.0)
55
+ }
56
+ }
57
+
53
58
  impl std::fmt::Debug for StringCacheKey {
54
59
  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55
60
  self.1.fmt(f)
@@ -72,43 +77,43 @@ impl std::hash::Hash for StringCacheKey {
72
77
 
73
78
  impl StringCache {
74
79
  #[allow(dead_code)]
75
- pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
80
+ pub fn intern(string: String) -> Result<Arc<StringCacheKey>, CacheError> {
76
81
  let mut cache = STRING_CACHE
77
82
  .lock()
78
83
  .map_err(|e| CacheError::LockError(e.to_string()))?;
79
84
 
80
85
  if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
81
86
  counter.fetch_add(1, Ordering::Relaxed);
82
- Ok(*interned_string)
87
+ Ok(interned_string.clone())
83
88
  } else {
84
- let interned = StringCacheKey::new(string.as_str());
89
+ let interned = Arc::new(StringCacheKey::new(string.as_str()));
85
90
  let leaked = Box::leak(string.into_boxed_str());
86
- cache.insert(leaked, (interned, AtomicU32::new(1)));
91
+ cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
87
92
  Ok(interned)
88
93
  }
89
94
  }
90
95
 
91
- pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
96
+ pub fn intern_many(strings: &[String]) -> Result<Vec<Arc<StringCacheKey>>, CacheError> {
92
97
  let mut cache = STRING_CACHE
93
98
  .lock()
94
99
  .map_err(|e| CacheError::LockError(e.to_string()))?;
95
100
 
96
- let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
101
+ let mut result: Vec<Arc<StringCacheKey>> = Vec::with_capacity(strings.len());
97
102
  for string in strings {
98
103
  if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
99
104
  counter.fetch_add(1, Ordering::Relaxed);
100
- result.push(*interned_string);
105
+ result.push(interned_string.clone());
101
106
  } else {
102
- let interned = StringCacheKey::new(string);
107
+ let interned = Arc::new(StringCacheKey::new(string));
103
108
  let leaked = Box::leak(string.clone().into_boxed_str());
104
- cache.insert(leaked, (interned, AtomicU32::new(1)));
109
+ cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
105
110
  result.push(interned);
106
111
  }
107
112
  }
108
113
  Ok(result)
109
114
  }
110
115
 
111
- pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
116
+ pub fn clear(headers: &[Arc<StringCacheKey>]) -> Result<(), CacheError> {
112
117
  let mut cache = STRING_CACHE
113
118
  .lock()
114
119
  .map_err(|e| CacheError::LockError(e.to_string()))?;
@@ -116,7 +121,7 @@ impl StringCache {
116
121
  let to_remove: Vec<_> = headers
117
122
  .iter()
118
123
  .filter_map(|header| {
119
- let key = header.as_ref();
124
+ let key = header.as_ref().as_ref();
120
125
  if let Some((_, (_, counter))) = cache.get_key_value(key) {
121
126
  let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
122
127
  if prev_count == 1 {
@@ -140,7 +145,7 @@ impl StringCache {
140
145
 
141
146
  pub struct HeaderCacheCleanupIter<I> {
142
147
  pub inner: I,
143
- pub headers: OnceLock<Vec<StringCacheKey>>,
148
+ pub headers: OnceLock<Vec<Arc<StringCacheKey>>>,
144
149
  }
145
150
 
146
151
  impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
@@ -1,6 +1,7 @@
1
1
  use std::borrow::Cow;
2
2
  use std::collections::HashMap;
3
3
  use std::hash::BuildHasher;
4
+ use std::sync::Arc;
4
5
 
5
6
  use super::header_cache::StringCacheKey;
6
7
  use super::CowStr;
@@ -14,7 +15,7 @@ pub trait RecordParser<'a> {
14
15
  type Output;
15
16
 
16
17
  fn parse(
17
- headers: &[StringCacheKey],
18
+ headers: &[Arc<StringCacheKey>],
18
19
  record: &CsvRecordType,
19
20
  null_string: Option<Cow<'a, str>>,
20
21
  ignore_null_bytes: bool,
@@ -22,13 +23,13 @@ pub trait RecordParser<'a> {
22
23
  }
23
24
 
24
25
  impl<'a, S: BuildHasher + Default> RecordParser<'a>
25
- for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
26
+ for HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>
26
27
  {
27
28
  type Output = Self;
28
29
 
29
30
  #[inline]
30
31
  fn parse(
31
- headers: &[StringCacheKey],
32
+ headers: &[Arc<StringCacheKey>],
32
33
  record: &CsvRecordType,
33
34
  null_string: Option<Cow<'a, str>>,
34
35
  ignore_null_bytes: bool,
@@ -65,7 +66,7 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
65
66
  }),
66
67
  };
67
68
 
68
- map.insert(*header, value);
69
+ map.insert(header.clone(), value);
69
70
  });
70
71
  map
71
72
  }
@@ -76,7 +77,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
76
77
 
77
78
  #[inline]
78
79
  fn parse(
79
- headers: &[StringCacheKey],
80
+ headers: &[Arc<StringCacheKey>],
80
81
  record: &CsvRecordType,
81
82
  null_string: Option<Cow<'a, str>>,
82
83
  ignore_null_bytes: bool,
@@ -1,13 +1,13 @@
1
1
  use itertools::Itertools;
2
2
  use magnus::{value::ReprValue, IntoValue, Ruby, Value};
3
- use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
3
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
4
4
 
5
5
  use super::StringCacheKey;
6
6
 
7
7
  #[derive(Debug)]
8
8
  pub enum CsvRecord<'a, S: BuildHasher + Default> {
9
9
  Vec(Vec<Option<CowStr<'a>>>),
10
- Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
10
+ Map(HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>),
11
11
  }
12
12
 
13
13
  impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -28,7 +28,7 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
28
28
 
29
29
  for chunk in &map.into_iter().chunks(128) {
30
30
  for (k, v) in chunk {
31
- values[i] = handle.into_value(k);
31
+ values[i] = handle.into_value(k.as_ref());
32
32
  values[i + 1] = handle.into_value(v);
33
33
  i += 2;
34
34
  }
@@ -5,6 +5,7 @@ use super::{header_cache::StringCache, ruby_reader::SeekableRead};
5
5
  use magnus::{Error, Ruby};
6
6
  use std::borrow::Cow;
7
7
  use std::io::{BufReader, Read};
8
+ use std::sync::Arc;
8
9
 
9
10
  /// Size of the internal buffer used for reading CSV records
10
11
  pub(crate) const READ_BUFFER_SIZE: usize = 16384;
@@ -14,7 +15,7 @@ pub(crate) const READ_BUFFER_SIZE: usize = 16384;
14
15
  /// This struct implements Iterator to provide a streaming interface for CSV records.
15
16
  pub struct RecordReader<'a, T: RecordParser<'a>> {
16
17
  reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
17
- headers: Vec<StringCacheKey>,
18
+ headers: Vec<Arc<StringCacheKey>>,
18
19
  null_string: Option<Cow<'a, str>>,
19
20
  string_record: CsvRecordType,
20
21
  parser: std::marker::PhantomData<T>,
@@ -36,25 +37,45 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
36
37
  ruby: &Ruby,
37
38
  reader: &mut csv::Reader<impl Read>,
38
39
  has_headers: bool,
40
+ lossy: bool,
39
41
  ) -> Result<Vec<String>, Error> {
40
- let first_row = reader.headers().map_err(|e| {
41
- Error::new(
42
- ruby.exception_runtime_error(),
43
- format!("Failed to read headers: {e}"),
44
- )
45
- })?;
46
-
47
- Ok(if has_headers {
48
- first_row.iter().map(String::from).collect()
42
+ let headers = if lossy {
43
+ let first_row = reader.byte_headers().map_err(|e| {
44
+ Error::new(
45
+ ruby.exception_runtime_error(),
46
+ format!("Failed to read headers: {e}"),
47
+ )
48
+ })?;
49
+ if has_headers {
50
+ first_row
51
+ .iter()
52
+ .map(String::from_utf8_lossy)
53
+ .map(|x| x.to_string())
54
+ .collect()
55
+ } else {
56
+ (0..first_row.len()).map(|i| format!("c{i}")).collect()
57
+ }
49
58
  } else {
50
- (0..first_row.len()).map(|i| format!("c{i}")).collect()
51
- })
59
+ let first_row = reader.headers().map_err(|e| {
60
+ Error::new(
61
+ ruby.exception_runtime_error(),
62
+ format!("Failed to read headers: {e}"),
63
+ )
64
+ })?;
65
+ if has_headers {
66
+ first_row.iter().map(String::from).collect()
67
+ } else {
68
+ (0..first_row.len()).map(|i| format!("c{i}")).collect()
69
+ }
70
+ };
71
+
72
+ Ok(headers)
52
73
  }
53
74
 
54
75
  /// Creates a new RecordReader instance.
55
76
  pub(crate) fn new(
56
77
  reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
57
- headers: Vec<StringCacheKey>,
78
+ headers: Vec<Arc<StringCacheKey>>,
58
79
  null_string: Option<Cow<'a, str>>,
59
80
  ignore_null_bytes: bool,
60
81
  lossy: bool,
@@ -5,6 +5,7 @@ use csv::Trim;
5
5
  use magnus::value::ReprValue;
6
6
  use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
7
7
  use std::collections::HashMap;
8
+ use std::sync::Arc;
8
9
 
9
10
  /// Valid result types for CSV parsing
10
11
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -93,7 +94,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
93
94
  match result_type {
94
95
  ResultType::Hash => {
95
96
  let builder = RecordReaderBuilder::<
96
- HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
97
+ HashMap<Arc<StringCacheKey>, Option<CowStr<'static>>, RandomState>,
97
98
  >::new(ruby, to_read)
98
99
  .has_headers(has_headers)
99
100
  .flexible(flexible)
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-30 00:00:00.000000000 Z
11
+ date: 2025-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys