osv 0.3.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f22d1d56b0eba1e23ca192db2c70e68689486e2f0032672285017e1f98a530d2
4
- data.tar.gz: 6288dce70b95faf312e8aa244ba56a60c3d59b85bdd16a4a951060df78b97e1e
3
+ metadata.gz: 687f5a3426558ea4d4d342f153dae1f11da93807daddcc6944888d92cda7502b
4
+ data.tar.gz: e9f766b94e4c7f806e81eec859a0146b6a4758c5a69c759b084f64e01fd7a888
5
5
  SHA512:
6
- metadata.gz: 5399b43ecd3987c73daf09341d51a1ee8e5d060f1085e9a7aac9b823ab723ccbbc4084c5c0c9abbd36e7e17becfcfa1757af6af3666e01ef3992a27e35b5b983
7
- data.tar.gz: 2a6d98b645af40ab08a5a01a2bcf5b67ce9ebff18a993e602f6b00fa2f3f80d65c63605a7f5a715286ac20751db0607c515686e4439156a4039ae24f49e95e10
6
+ metadata.gz: bb14e59bca79613064a07cbcfa6c2531f08c68eb131e512b9822c7295bcacfd051de8dcd49100d12c8377d90a438d528dc5b20bcf6a1aa7c4e44f8e2aaae7111
7
+ data.tar.gz: a3a83f9653ef5bc41c908de14c96b756c719fd2750c8739b4a2165ff91ed82d34fdfde184c853185a8ff516e324d01816c65764f05adf58db6b2159ae66b0c74
data/Cargo.lock CHANGED
@@ -43,6 +43,12 @@ version = "2.6.0"
43
43
  source = "registry+https://github.com/rust-lang/crates.io-index"
44
44
  checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
45
45
 
46
+ [[package]]
47
+ name = "bumpalo"
48
+ version = "3.16.0"
49
+ source = "registry+https://github.com/rust-lang/crates.io-index"
50
+ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
51
+
46
52
  [[package]]
47
53
  name = "cexpr"
48
54
  version = "0.6.0"
@@ -232,6 +238,7 @@ dependencies = [
232
238
  name = "osv"
233
239
  version = "0.1.0"
234
240
  dependencies = [
241
+ "bumpalo",
235
242
  "csv",
236
243
  "kanal",
237
244
  "magnus 0.7.1",
@@ -260,18 +267,18 @@ dependencies = [
260
267
 
261
268
  [[package]]
262
269
  name = "rb-sys"
263
- version = "0.9.103"
270
+ version = "0.9.104"
264
271
  source = "registry+https://github.com/rust-lang/crates.io-index"
265
- checksum = "91dbe37ab6ac2fba187480fb6544b92445e41e5c6f553bf0c33743f3c450a1df"
272
+ checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2"
266
273
  dependencies = [
267
274
  "rb-sys-build",
268
275
  ]
269
276
 
270
277
  [[package]]
271
278
  name = "rb-sys-build"
272
- version = "0.9.103"
279
+ version = "0.9.104"
273
280
  source = "registry+https://github.com/rust-lang/crates.io-index"
274
- checksum = "c4d56a49dcb646b70b758789c0d16c055a386a4f2a3346333abb69850fa860ce"
281
+ checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc"
275
282
  dependencies = [
276
283
  "bindgen",
277
284
  "lazy_static",
data/ext/osv/Cargo.toml CHANGED
@@ -7,9 +7,10 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
- csv = "1.3.1"
10
+ csv = "^1.3"
11
11
  magnus = { version = "0.7", features = ["rb-sys"] }
12
- rb-sys = "0.9"
12
+ rb-sys = "^0.9"
13
13
  serde = { version = "1.0", features = ["derive"] }
14
14
  serde_magnus = "0.8.1"
15
15
  kanal = "0.1.0-pre8"
16
+ bumpalo = { version = "3.16.0", features = ["collections"] }
@@ -1,4 +1,5 @@
1
1
  use super::{
2
+ header_cache::StringCache,
2
3
  parser::RecordParser,
3
4
  reader::{ReadImpl, RecordReader},
4
5
  };
@@ -80,7 +81,6 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
80
81
 
81
82
  pub fn build(self) -> Result<RecordReader<T>, Error> {
82
83
  let readable = self.get_reader()?;
83
-
84
84
  let mut reader = csv::ReaderBuilder::new()
85
85
  .has_headers(self.has_headers)
86
86
  .delimiter(self.delimiter)
@@ -88,14 +88,21 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
88
88
  .from_reader(readable);
89
89
 
90
90
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
91
- let headers_clone = headers.clone();
92
91
  let null_string = self.null_string;
93
92
 
93
+ let static_headers = StringCache::intern_many(&headers).map_err(|e| {
94
+ Error::new(
95
+ self.ruby.exception_runtime_error(),
96
+ format!("Failed to intern headers: {e}"),
97
+ )
98
+ })?;
99
+ let headers_for_cleanup = static_headers.clone();
100
+
94
101
  let (sender, receiver) = kanal::bounded(self.buffer);
95
102
  let handle = thread::spawn(move || {
96
103
  let mut record = csv::StringRecord::new();
97
104
  while let Ok(true) = reader.read_record(&mut record) {
98
- let row = T::parse(&headers_clone, &record, &null_string);
105
+ let row = T::parse(&static_headers, &record, &null_string);
99
106
  if sender.send(row).is_err() {
100
107
  break;
101
108
  }
@@ -106,6 +113,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
106
113
 
107
114
  Ok(RecordReader {
108
115
  reader: ReadImpl::MultiThreaded {
116
+ headers: headers_for_cleanup,
109
117
  receiver,
110
118
  handle: Some(handle),
111
119
  },
@@ -0,0 +1,63 @@
// Process-wide, reference-counted interning of header names.
//
// `intern` / `intern_many` hand out `&'static str` references that stay valid until the
// matching number of `clear` calls has been made for that string; the final `clear`
// frees the backing allocation.

use std::{
    borrow::Cow,
    collections::HashMap,
    sync::{LazyLock, Mutex},
};

/// Bookkeeping for one interned string.
struct Entry {
    /// Pointer obtained from `Box::into_raw`; it is the owner of the allocation and is
    /// the only pointer ever passed back to `Box::from_raw`.
    ptr: *mut str,
    /// Number of acquisitions that have not been released through `clear` yet.
    count: usize,
}

// SAFETY: `ptr` owns an immutable heap `str` and is only read or freed while the
// `STRING_CACHE` mutex is held, so moving an `Entry` across threads is sound.
unsafe impl Send for Entry {}

static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, Entry>>> =
    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

pub struct StringCache {}

impl StringCache {
    /// Interns a single string and returns the shared `'static` reference for it.
    ///
    /// Each successful call must eventually be balanced by one `clear` of the returned
    /// reference.
    ///
    /// # Errors
    /// Returns the poison message if the cache mutex is poisoned.
    // Kept for API completeness; the visible callers only use `intern_many`.
    #[allow(dead_code)]
    pub fn intern(string: String) -> Result<&'static str, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
        Ok(Self::acquire(&mut cache, Cow::Owned(string)))
    }

    /// Interns every string in `strings` under a single lock acquisition.
    ///
    /// The result has the same length and order as `strings`; duplicates are counted
    /// once per occurrence, so passing the returned vector to `clear` releases exactly
    /// what this call acquired.
    ///
    /// # Errors
    /// Returns the poison message if the cache mutex is poisoned.
    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
        Ok(strings
            .iter()
            .map(|s| Self::acquire(&mut cache, Cow::Borrowed(s.as_str())))
            .collect())
    }

    /// Releases one acquisition for each element of `headers`.
    ///
    /// When the last acquisition of a string is released, its entry is removed from the
    /// cache and the allocation is freed. Strings that are not in the cache are ignored.
    ///
    /// After this call the caller must not read any released reference again (unless
    /// it still holds another, un-released acquisition of the same string).
    ///
    /// # Errors
    /// Returns the poison message if the cache mutex is poisoned.
    pub fn clear(headers: &[&'static str]) -> Result<(), String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

        for header in headers {
            let Some(entry) = cache.get_mut(*header) else {
                continue;
            };
            // Entries are removed as soon as they hit zero, so `count` is at least 1
            // here; saturate anyway so a logic error can never wrap the counter.
            entry.count = entry.count.saturating_sub(1);
            if entry.count > 0 {
                continue;
            }

            let ptr = entry.ptr;
            // Drop the map key BEFORE freeing: the key borrows the allocation, and the
            // lookup itself hashes through `header`, which may point into it as well.
            cache.remove(*header);
            // SAFETY: `ptr` came from `Box::into_raw` in `acquire`, has not been freed
            // (its entry was still in the map), and the map no longer references it.
            unsafe {
                drop(Box::from_raw(ptr));
            }
        }

        Ok(())
    }

    /// Looks `string` up in the (already locked) cache, bumping its count, or moves it
    /// into a new heap allocation owned by the cache when it is not present yet.
    fn acquire(cache: &mut HashMap<&'static str, Entry>, string: Cow<'_, str>) -> &'static str {
        if let Some(entry) = cache.get_mut(&*string) {
            entry.count += 1;
            // SAFETY: the entry is live, so `ptr` still points at its allocation.
            return unsafe { &*entry.ptr };
        }

        let ptr = Box::into_raw(string.into_owned().into_boxed_str());
        // SAFETY: `ptr` was just produced by `Box::into_raw` and is only freed by
        // `clear`, after the entry (and with it this key) has left the map.
        let interned: &'static str = unsafe { &*ptr };
        cache.insert(interned, Entry { ptr, count: 1 });
        interned
    }
}
@@ -1,8 +1,8 @@
1
1
  mod builder;
2
+ mod header_cache;
2
3
  mod parser;
3
4
  mod reader;
4
5
  mod record;
5
6
 
6
7
  pub use builder::RecordReaderBuilder;
7
8
  pub use record::CsvRecord;
8
-
@@ -2,42 +2,50 @@ use std::collections::HashMap;
2
2
 
3
3
  pub trait RecordParser {
4
4
  type Output;
5
-
6
- fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output;
5
+ fn parse(
6
+ headers: &[&'static str],
7
+ record: &csv::StringRecord,
8
+ null_string: &str,
9
+ ) -> Self::Output;
7
10
  }
8
11
 
9
- impl RecordParser for HashMap<String, Option<String>> {
12
+ impl RecordParser for HashMap<&'static str, Option<String>> {
10
13
  type Output = Self;
11
-
12
- fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
13
- headers
14
- .iter()
15
- .zip(record.iter())
16
- .map(|(header, field)| {
17
- let value = if field == null_string {
14
+ fn parse(
15
+ headers: &[&'static str],
16
+ record: &csv::StringRecord,
17
+ null_string: &str,
18
+ ) -> Self::Output {
19
+ let mut map = HashMap::with_capacity(headers.len());
20
+ for (header, field) in headers.iter().zip(record.iter()) {
21
+ map.insert(
22
+ *header,
23
+ if field == null_string {
18
24
  None
19
25
  } else {
20
26
  Some(field.to_string())
21
- };
22
- (header.clone(), value)
23
- })
24
- .collect()
27
+ },
28
+ );
29
+ }
30
+ map
25
31
  }
26
32
  }
27
33
 
28
34
  impl RecordParser for Vec<Option<String>> {
29
35
  type Output = Self;
30
-
31
- fn parse(_headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
32
- record
33
- .iter()
34
- .map(|field| {
35
- if field == null_string {
36
- None
37
- } else {
38
- Some(field.to_string())
39
- }
40
- })
41
- .collect()
36
+ fn parse(
37
+ _headers: &[&'static str],
38
+ record: &csv::StringRecord,
39
+ null_string: &str,
40
+ ) -> Self::Output {
41
+ let mut vec = Vec::with_capacity(record.len());
42
+ for field in record.iter() {
43
+ vec.push(if field == null_string {
44
+ None
45
+ } else {
46
+ Some(field.to_string())
47
+ });
48
+ }
49
+ vec
42
50
  }
43
51
  }
@@ -1,4 +1,4 @@
1
- use super::parser::RecordParser;
1
+ use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use magnus::{Error, Ruby};
3
3
  use std::{io::Read, thread};
4
4
 
@@ -6,14 +6,36 @@ pub struct RecordReader<T: RecordParser> {
6
6
  pub(crate) reader: ReadImpl<T>,
7
7
  }
8
8
 
9
+ impl<T: RecordParser> Drop for RecordReader<T> {
10
+ fn drop(&mut self) {
11
+ match &mut self.reader {
12
+ ReadImpl::MultiThreaded {
13
+ receiver,
14
+ handle,
15
+ headers,
16
+ } => {
17
+ receiver.close();
18
+ if let Some(handle) = handle.take() {
19
+ let _ = handle.join();
20
+ }
21
+ StringCache::clear(headers).unwrap();
22
+ }
23
+ ReadImpl::SingleThreaded { headers, .. } => {
24
+ StringCache::clear(headers).unwrap();
25
+ }
26
+ }
27
+ }
28
+ }
29
+
9
30
  #[allow(dead_code)]
10
31
  pub enum ReadImpl<T: RecordParser> {
11
32
  SingleThreaded {
12
33
  reader: csv::Reader<Box<dyn Read + Send + 'static>>,
13
- headers: Vec<String>,
34
+ headers: Vec<&'static str>,
14
35
  null_string: String,
15
36
  },
16
37
  MultiThreaded {
38
+ headers: Vec<&'static str>,
17
39
  receiver: kanal::Receiver<T::Output>,
18
40
  handle: Option<thread::JoinHandle<()>>,
19
41
  },
@@ -48,7 +70,9 @@ impl<T: RecordParser> Iterator for RecordReader<T> {
48
70
 
49
71
  fn next(&mut self) -> Option<Self::Item> {
50
72
  match &mut self.reader {
51
- ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
73
+ ReadImpl::MultiThreaded {
74
+ receiver, handle, ..
75
+ } => match receiver.recv() {
52
76
  Ok(record) => Some(record),
53
77
  Err(_) => {
54
78
  if let Some(handle) = handle.take() {
@@ -1,17 +1,23 @@
1
- use magnus::{IntoValue, Ruby, Value};
1
+ use magnus::{IntoValue, RHash, Ruby, Value};
2
2
  use std::collections::HashMap;
3
3
 
4
4
  #[derive(Debug)]
5
5
  pub enum CsvRecord {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<String, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>>),
8
8
  }
9
9
 
10
10
  impl IntoValue for CsvRecord {
11
11
  fn into_value_with(self, handle: &Ruby) -> Value {
12
12
  match self {
13
13
  CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
- CsvRecord::Map(map) => map.into_value_with(handle),
14
+ CsvRecord::Map(map) => {
15
+ let hash = RHash::new();
16
+ for (k, v) in map {
17
+ hash.aset(k, v).unwrap();
18
+ }
19
+ hash.into_value_with(handle)
20
+ }
15
21
  }
16
22
  }
17
23
  }
@@ -1,15 +1,15 @@
1
- use std::collections::HashMap;
2
-
3
1
  use crate::csv::{CsvRecord, RecordReaderBuilder};
4
2
  use crate::utils::*;
5
3
  use magnus::value::ReprValue;
6
4
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
+ use std::collections::HashMap;
7
6
 
8
7
  pub fn parse_csv(
9
- ruby: &Ruby,
10
8
  rb_self: Value,
11
9
  args: &[Value],
12
10
  ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
11
+ let ruby = unsafe { Ruby::get_unchecked() };
12
+
13
13
  let CsvArgs {
14
14
  to_read,
15
15
  has_headers,
@@ -18,7 +18,7 @@ pub fn parse_csv(
18
18
  null_string,
19
19
  buffer_size,
20
20
  result_type,
21
- } = parse_csv_args(ruby, args)?;
21
+ } = parse_csv_args(&ruby, args)?;
22
22
 
23
23
  if !ruby.block_given() {
24
24
  return create_enumerator(EnumeratorArgs {
@@ -35,7 +35,7 @@ pub fn parse_csv(
35
35
 
36
36
  let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
37
37
  "hash" => Box::new(
38
- RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
38
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
39
39
  .has_headers(has_headers)
40
40
  .delimiter(delimiter)
41
41
  .quote_char(quote_char)
@@ -45,7 +45,7 @@ pub fn parse_csv(
45
45
  .map(CsvRecord::Map),
46
46
  ),
47
47
  "array" => Box::new(
48
- RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
48
+ RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
49
49
  .has_headers(has_headers)
50
50
  .delimiter(delimiter)
51
51
  .quote_char(quote_char)
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
@@ -56,6 +56,7 @@ files:
56
56
  - ext/osv/Cargo.toml
57
57
  - ext/osv/extconf.rb
58
58
  - ext/osv/src/csv/builder.rs
59
+ - ext/osv/src/csv/header_cache.rs
59
60
  - ext/osv/src/csv/mod.rs
60
61
  - ext/osv/src/csv/parser.rs
61
62
  - ext/osv/src/csv/reader.rs