osv 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f22d1d56b0eba1e23ca192db2c70e68689486e2f0032672285017e1f98a530d2
4
- data.tar.gz: 6288dce70b95faf312e8aa244ba56a60c3d59b85bdd16a4a951060df78b97e1e
3
+ metadata.gz: 687f5a3426558ea4d4d342f153dae1f11da93807daddcc6944888d92cda7502b
4
+ data.tar.gz: e9f766b94e4c7f806e81eec859a0146b6a4758c5a69c759b084f64e01fd7a888
5
5
  SHA512:
6
- metadata.gz: 5399b43ecd3987c73daf09341d51a1ee8e5d060f1085e9a7aac9b823ab723ccbbc4084c5c0c9abbd36e7e17becfcfa1757af6af3666e01ef3992a27e35b5b983
7
- data.tar.gz: 2a6d98b645af40ab08a5a01a2bcf5b67ce9ebff18a993e602f6b00fa2f3f80d65c63605a7f5a715286ac20751db0607c515686e4439156a4039ae24f49e95e10
6
+ metadata.gz: bb14e59bca79613064a07cbcfa6c2531f08c68eb131e512b9822c7295bcacfd051de8dcd49100d12c8377d90a438d528dc5b20bcf6a1aa7c4e44f8e2aaae7111
7
+ data.tar.gz: a3a83f9653ef5bc41c908de14c96b756c719fd2750c8739b4a2165ff91ed82d34fdfde184c853185a8ff516e324d01816c65764f05adf58db6b2159ae66b0c74
data/Cargo.lock CHANGED
@@ -43,6 +43,12 @@ version = "2.6.0"
43
43
  source = "registry+https://github.com/rust-lang/crates.io-index"
44
44
  checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
45
45
 
46
+ [[package]]
47
+ name = "bumpalo"
48
+ version = "3.16.0"
49
+ source = "registry+https://github.com/rust-lang/crates.io-index"
50
+ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
51
+
46
52
  [[package]]
47
53
  name = "cexpr"
48
54
  version = "0.6.0"
@@ -232,6 +238,7 @@ dependencies = [
232
238
  name = "osv"
233
239
  version = "0.1.0"
234
240
  dependencies = [
241
+ "bumpalo",
235
242
  "csv",
236
243
  "kanal",
237
244
  "magnus 0.7.1",
@@ -260,18 +267,18 @@ dependencies = [
260
267
 
261
268
  [[package]]
262
269
  name = "rb-sys"
263
- version = "0.9.103"
270
+ version = "0.9.104"
264
271
  source = "registry+https://github.com/rust-lang/crates.io-index"
265
- checksum = "91dbe37ab6ac2fba187480fb6544b92445e41e5c6f553bf0c33743f3c450a1df"
272
+ checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2"
266
273
  dependencies = [
267
274
  "rb-sys-build",
268
275
  ]
269
276
 
270
277
  [[package]]
271
278
  name = "rb-sys-build"
272
- version = "0.9.103"
279
+ version = "0.9.104"
273
280
  source = "registry+https://github.com/rust-lang/crates.io-index"
274
- checksum = "c4d56a49dcb646b70b758789c0d16c055a386a4f2a3346333abb69850fa860ce"
281
+ checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc"
275
282
  dependencies = [
276
283
  "bindgen",
277
284
  "lazy_static",
data/ext/osv/Cargo.toml CHANGED
@@ -7,9 +7,10 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
- csv = "1.3.1"
10
+ csv = "^1.3"
11
11
  magnus = { version = "0.7", features = ["rb-sys"] }
12
- rb-sys = "0.9"
12
+ rb-sys = "^0.9"
13
13
  serde = { version = "1.0", features = ["derive"] }
14
14
  serde_magnus = "0.8.1"
15
15
  kanal = "0.1.0-pre8"
16
+ bumpalo = { version = "3.16.0", features = ["collections"] }
@@ -1,4 +1,5 @@
1
1
  use super::{
2
+ header_cache::StringCache,
2
3
  parser::RecordParser,
3
4
  reader::{ReadImpl, RecordReader},
4
5
  };
@@ -80,7 +81,6 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
80
81
 
81
82
  pub fn build(self) -> Result<RecordReader<T>, Error> {
82
83
  let readable = self.get_reader()?;
83
-
84
84
  let mut reader = csv::ReaderBuilder::new()
85
85
  .has_headers(self.has_headers)
86
86
  .delimiter(self.delimiter)
@@ -88,14 +88,21 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
88
88
  .from_reader(readable);
89
89
 
90
90
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
91
- let headers_clone = headers.clone();
92
91
  let null_string = self.null_string;
93
92
 
93
+ let static_headers = StringCache::intern_many(&headers).map_err(|e| {
94
+ Error::new(
95
+ self.ruby.exception_runtime_error(),
96
+ format!("Failed to intern headers: {e}"),
97
+ )
98
+ })?;
99
+ let headers_for_cleanup = static_headers.clone();
100
+
94
101
  let (sender, receiver) = kanal::bounded(self.buffer);
95
102
  let handle = thread::spawn(move || {
96
103
  let mut record = csv::StringRecord::new();
97
104
  while let Ok(true) = reader.read_record(&mut record) {
98
- let row = T::parse(&headers_clone, &record, &null_string);
105
+ let row = T::parse(&static_headers, &record, &null_string);
99
106
  if sender.send(row).is_err() {
100
107
  break;
101
108
  }
@@ -106,6 +113,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
106
113
 
107
114
  Ok(RecordReader {
108
115
  reader: ReadImpl::MultiThreaded {
116
+ headers: headers_for_cleanup,
109
117
  receiver,
110
118
  handle: Some(handle),
111
119
  },
@@ -0,0 +1,63 @@
1
+ use std::{
2
+ collections::HashMap,
3
+ sync::{atomic::AtomicU32, LazyLock, Mutex},
4
+ };
5
+
6
/// Process-wide table of interned strings.
///
/// Every key is a string that was leaked with `Box::leak`; the value counts how many
/// outstanding `intern` / `intern_many` results currently refer to that key. An entry is
/// removed (and its allocation freed) by `StringCache::clear` once the count drops to zero,
/// so every entry present in the map always has a count of at least one.
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

/// Namespace for the reference-counted string interner backed by `STRING_CACHE`.
pub struct StringCache {}

impl StringCache {
    /// Interns one string while the cache lock is already held.
    ///
    /// Returns the existing key (bumping its reference count) when an equal string is
    /// already cached; otherwise leaks `string`, registers it with a count of one and
    /// returns the leaked reference.
    fn intern_locked<S>(cache: &mut HashMap<&'static str, AtomicU32>, string: S) -> &'static str
    where
        S: AsRef<str> + Into<Box<str>>,
    {
        if let Some((&existing, count)) = cache.get_key_value(string.as_ref()) {
            // The mutex already serialises access, so relaxed ordering is sufficient.
            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            return existing;
        }

        let boxed: Box<str> = string.into();
        let leaked: &'static str = Box::leak(boxed);
        cache.insert(leaked, AtomicU32::new(1));
        leaked
    }

    /// Interns a single owned string and returns its cached `'static` reference.
    ///
    /// Each successful call must eventually be balanced by one `clear` call containing
    /// the returned reference.
    ///
    /// # Errors
    ///
    /// Returns the poison message if the cache mutex is poisoned.
    #[allow(dead_code)]
    pub fn intern(string: String) -> Result<&'static str, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
        Ok(Self::intern_locked(&mut cache, string))
    }

    /// Interns every string in `strings` under a single lock acquisition.
    ///
    /// The result has the same length and order as `strings`. Duplicate inputs yield the
    /// same reference and bump its count once per occurrence.
    ///
    /// # Errors
    ///
    /// Returns the poison message if the cache mutex is poisoned.
    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
        Ok(strings
            .iter()
            .map(|string| Self::intern_locked(&mut cache, string.as_str()))
            .collect())
    }

    /// Releases one reference for each entry of `headers`.
    ///
    /// When the last reference to a string is released, its entry is removed from the
    /// cache and the leaked allocation is freed. Strings that are not in the cache are
    /// ignored. After this call the caller must no longer read any of the references it
    /// passed in, because they may have been deallocated.
    ///
    /// # Errors
    ///
    /// Returns the poison message if the cache mutex is poisoned.
    pub fn clear(headers: &[&'static str]) -> Result<(), String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

        for header in headers {
            let Some((&key, count)) = cache.get_key_value(*header) else {
                continue;
            };

            // `fetch_sub` returns the value held *before* the subtraction, so the last
            // outstanding reference is the one that observes 1, not 0.
            if count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed) == 1 {
                // Drop the map entry first so the cache never holds a dangling key.
                cache.remove(key);

                // SAFETY: `key` is the reference stored in the map, which is only ever
                // inserted by `intern_locked` from `Box::leak`. Its entry has just been
                // removed while the lock is held, so the allocation is reclaimed exactly
                // once and the cache can no longer hand it out.
                unsafe {
                    drop(Box::from_raw(key as *const str as *mut str));
                }
            }
        }

        Ok(())
    }
}
@@ -1,8 +1,8 @@
1
1
  mod builder;
2
+ mod header_cache;
2
3
  mod parser;
3
4
  mod reader;
4
5
  mod record;
5
6
 
6
7
  pub use builder::RecordReaderBuilder;
7
8
  pub use record::CsvRecord;
8
-
@@ -2,42 +2,50 @@ use std::collections::HashMap;
2
2
 
3
3
  pub trait RecordParser {
4
4
  type Output;
5
-
6
- fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output;
5
+ fn parse(
6
+ headers: &[&'static str],
7
+ record: &csv::StringRecord,
8
+ null_string: &str,
9
+ ) -> Self::Output;
7
10
  }
8
11
 
9
- impl RecordParser for HashMap<String, Option<String>> {
12
+ impl RecordParser for HashMap<&'static str, Option<String>> {
10
13
  type Output = Self;
11
-
12
- fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
13
- headers
14
- .iter()
15
- .zip(record.iter())
16
- .map(|(header, field)| {
17
- let value = if field == null_string {
14
+ fn parse(
15
+ headers: &[&'static str],
16
+ record: &csv::StringRecord,
17
+ null_string: &str,
18
+ ) -> Self::Output {
19
+ let mut map = HashMap::with_capacity(headers.len());
20
+ for (header, field) in headers.iter().zip(record.iter()) {
21
+ map.insert(
22
+ *header,
23
+ if field == null_string {
18
24
  None
19
25
  } else {
20
26
  Some(field.to_string())
21
- };
22
- (header.clone(), value)
23
- })
24
- .collect()
27
+ },
28
+ );
29
+ }
30
+ map
25
31
  }
26
32
  }
27
33
 
28
34
  impl RecordParser for Vec<Option<String>> {
29
35
  type Output = Self;
30
-
31
- fn parse(_headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
32
- record
33
- .iter()
34
- .map(|field| {
35
- if field == null_string {
36
- None
37
- } else {
38
- Some(field.to_string())
39
- }
40
- })
41
- .collect()
36
+ fn parse(
37
+ _headers: &[&'static str],
38
+ record: &csv::StringRecord,
39
+ null_string: &str,
40
+ ) -> Self::Output {
41
+ let mut vec = Vec::with_capacity(record.len());
42
+ for field in record.iter() {
43
+ vec.push(if field == null_string {
44
+ None
45
+ } else {
46
+ Some(field.to_string())
47
+ });
48
+ }
49
+ vec
42
50
  }
43
51
  }
@@ -1,4 +1,4 @@
1
- use super::parser::RecordParser;
1
+ use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use magnus::{Error, Ruby};
3
3
  use std::{io::Read, thread};
4
4
 
@@ -6,14 +6,36 @@ pub struct RecordReader<T: RecordParser> {
6
6
  pub(crate) reader: ReadImpl<T>,
7
7
  }
8
8
 
9
+ impl<T: RecordParser> Drop for RecordReader<T> {
10
+ fn drop(&mut self) {
11
+ match &mut self.reader {
12
+ ReadImpl::MultiThreaded {
13
+ receiver,
14
+ handle,
15
+ headers,
16
+ } => {
17
+ receiver.close();
18
+ if let Some(handle) = handle.take() {
19
+ let _ = handle.join();
20
+ }
21
+ StringCache::clear(headers).unwrap();
22
+ }
23
+ ReadImpl::SingleThreaded { headers, .. } => {
24
+ StringCache::clear(headers).unwrap();
25
+ }
26
+ }
27
+ }
28
+ }
29
+
9
30
  #[allow(dead_code)]
10
31
  pub enum ReadImpl<T: RecordParser> {
11
32
  SingleThreaded {
12
33
  reader: csv::Reader<Box<dyn Read + Send + 'static>>,
13
- headers: Vec<String>,
34
+ headers: Vec<&'static str>,
14
35
  null_string: String,
15
36
  },
16
37
  MultiThreaded {
38
+ headers: Vec<&'static str>,
17
39
  receiver: kanal::Receiver<T::Output>,
18
40
  handle: Option<thread::JoinHandle<()>>,
19
41
  },
@@ -48,7 +70,9 @@ impl<T: RecordParser> Iterator for RecordReader<T> {
48
70
 
49
71
  fn next(&mut self) -> Option<Self::Item> {
50
72
  match &mut self.reader {
51
- ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
73
+ ReadImpl::MultiThreaded {
74
+ receiver, handle, ..
75
+ } => match receiver.recv() {
52
76
  Ok(record) => Some(record),
53
77
  Err(_) => {
54
78
  if let Some(handle) = handle.take() {
@@ -1,17 +1,23 @@
1
- use magnus::{IntoValue, Ruby, Value};
1
+ use magnus::{IntoValue, RHash, Ruby, Value};
2
2
  use std::collections::HashMap;
3
3
 
4
4
  #[derive(Debug)]
5
5
  pub enum CsvRecord {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<String, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>>),
8
8
  }
9
9
 
10
10
  impl IntoValue for CsvRecord {
11
11
  fn into_value_with(self, handle: &Ruby) -> Value {
12
12
  match self {
13
13
  CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
- CsvRecord::Map(map) => map.into_value_with(handle),
14
+ CsvRecord::Map(map) => {
15
+ let hash = RHash::new();
16
+ for (k, v) in map {
17
+ hash.aset(k, v).unwrap();
18
+ }
19
+ hash.into_value_with(handle)
20
+ }
15
21
  }
16
22
  }
17
23
  }
@@ -1,15 +1,15 @@
1
- use std::collections::HashMap;
2
-
3
1
  use crate::csv::{CsvRecord, RecordReaderBuilder};
4
2
  use crate::utils::*;
5
3
  use magnus::value::ReprValue;
6
4
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
+ use std::collections::HashMap;
7
6
 
8
7
  pub fn parse_csv(
9
- ruby: &Ruby,
10
8
  rb_self: Value,
11
9
  args: &[Value],
12
10
  ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
11
+ let ruby = unsafe { Ruby::get_unchecked() };
12
+
13
13
  let CsvArgs {
14
14
  to_read,
15
15
  has_headers,
@@ -18,7 +18,7 @@ pub fn parse_csv(
18
18
  null_string,
19
19
  buffer_size,
20
20
  result_type,
21
- } = parse_csv_args(ruby, args)?;
21
+ } = parse_csv_args(&ruby, args)?;
22
22
 
23
23
  if !ruby.block_given() {
24
24
  return create_enumerator(EnumeratorArgs {
@@ -35,7 +35,7 @@ pub fn parse_csv(
35
35
 
36
36
  let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
37
37
  "hash" => Box::new(
38
- RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
38
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
39
39
  .has_headers(has_headers)
40
40
  .delimiter(delimiter)
41
41
  .quote_char(quote_char)
@@ -45,7 +45,7 @@ pub fn parse_csv(
45
45
  .map(CsvRecord::Map),
46
46
  ),
47
47
  "array" => Box::new(
48
- RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
48
+ RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
49
49
  .has_headers(has_headers)
50
50
  .delimiter(delimiter)
51
51
  .quote_char(quote_char)
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
@@ -56,6 +56,7 @@ files:
56
56
  - ext/osv/Cargo.toml
57
57
  - ext/osv/extconf.rb
58
58
  - ext/osv/src/csv/builder.rs
59
+ - ext/osv/src/csv/header_cache.rs
59
60
  - ext/osv/src/csv/mod.rs
60
61
  - ext/osv/src/csv/parser.rs
61
62
  - ext/osv/src/csv/reader.rs