osv 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9a6579b30fb8761382666c3fcfa434d9234c3ebf508cad7b3045b9f4ebc2f8b7
4
- data.tar.gz: afb793b1df201e876e955ed75904c729aa353bdee3ae374085902efdd4601603
3
+ metadata.gz: 33c644fac6e61f8bf3b9f11e646d6706017ece2a386df03a568b5ed06fd91e2a
4
+ data.tar.gz: 0d977fb3a7eaf867663feb76161eb2d3fbe42e758523ef8cafbff51a9acfef0d
5
5
  SHA512:
6
- metadata.gz: 4318353ec8b32f026d3a3c8675a26b6a6545faba213bba28ada9b0ab7d978ea8d7dd6a8e202aaefb765040cc0d54990fd94f8263bbf4ecd6a4150ab9f48e7cc6
7
- data.tar.gz: d0c590105f070773f82a2e00c0c069439c55055cfe958e7eb3bcf443a83fb1d3dd1f099b93f2835a30797024451666e42f18aeaa361432824d18f77cfaa4748f
6
+ metadata.gz: b6fe382c005837fbfc705bd02b1859fdfa9fa9f955c15f7b13ffacb97f1d5dc0714288c23a7e2431a21aada6aacb1952b33ae8d14acd019c96ed719f5580c02d
7
+ data.tar.gz: 4e7c3d783f23af709505a4c182d1ee6744c9f28226aecdf68c532ada9ffa653858bc730bfd879cfa94d8e8f7a1280c49380bb5b6f74ed16b7ac776318aa53d1b
data/Cargo.lock CHANGED
@@ -2,6 +2,12 @@
2
2
  # It is not intended for manual editing.
3
3
  version = 3
4
4
 
5
+ [[package]]
6
+ name = "adler2"
7
+ version = "2.0.0"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
10
+
5
11
  [[package]]
6
12
  name = "aho-corasick"
7
13
  version = "1.1.3"
@@ -69,6 +75,15 @@ dependencies = [
69
75
  "libloading",
70
76
  ]
71
77
 
78
+ [[package]]
79
+ name = "crc32fast"
80
+ version = "1.4.2"
81
+ source = "registry+https://github.com/rust-lang/crates.io-index"
82
+ checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
83
+ dependencies = [
84
+ "cfg-if",
85
+ ]
86
+
72
87
  [[package]]
73
88
  name = "csv"
74
89
  version = "1.3.1"
@@ -96,6 +111,16 @@ version = "1.13.0"
96
111
  source = "registry+https://github.com/rust-lang/crates.io-index"
97
112
  checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
98
113
 
114
+ [[package]]
115
+ name = "flate2"
116
+ version = "1.0.35"
117
+ source = "registry+https://github.com/rust-lang/crates.io-index"
118
+ checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c"
119
+ dependencies = [
120
+ "crc32fast",
121
+ "miniz_oxide",
122
+ ]
123
+
99
124
  [[package]]
100
125
  name = "futures-core"
101
126
  version = "0.3.31"
@@ -218,6 +243,15 @@ version = "0.2.1"
218
243
  source = "registry+https://github.com/rust-lang/crates.io-index"
219
244
  checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
220
245
 
246
+ [[package]]
247
+ name = "miniz_oxide"
248
+ version = "0.8.2"
249
+ source = "registry+https://github.com/rust-lang/crates.io-index"
250
+ checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394"
251
+ dependencies = [
252
+ "adler2",
253
+ ]
254
+
221
255
  [[package]]
222
256
  name = "nom"
223
257
  version = "7.1.3"
@@ -233,6 +267,7 @@ name = "osv"
233
267
  version = "0.1.0"
234
268
  dependencies = [
235
269
  "csv",
270
+ "flate2",
236
271
  "kanal",
237
272
  "magnus 0.7.1",
238
273
  "rb-sys",
@@ -260,18 +295,18 @@ dependencies = [
260
295
 
261
296
  [[package]]
262
297
  name = "rb-sys"
263
- version = "0.9.103"
298
+ version = "0.9.104"
264
299
  source = "registry+https://github.com/rust-lang/crates.io-index"
265
- checksum = "91dbe37ab6ac2fba187480fb6544b92445e41e5c6f553bf0c33743f3c450a1df"
300
+ checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2"
266
301
  dependencies = [
267
302
  "rb-sys-build",
268
303
  ]
269
304
 
270
305
  [[package]]
271
306
  name = "rb-sys-build"
272
- version = "0.9.103"
307
+ version = "0.9.104"
273
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
274
- checksum = "c4d56a49dcb646b70b758789c0d16c055a386a4f2a3346333abb69850fa860ce"
309
+ checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc"
275
310
  dependencies = [
276
311
  "bindgen",
277
312
  "lazy_static",
data/ext/osv/Cargo.toml CHANGED
@@ -7,9 +7,10 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
- csv = "1.3.1"
10
+ csv = "^1.3"
11
+ flate2 = "1.0.35"
12
+ kanal = "0.1.0-pre8"
11
13
  magnus = { version = "0.7", features = ["rb-sys"] }
12
- rb-sys = "0.9"
14
+ rb-sys = "^0.9"
13
15
  serde = { version = "1.0", features = ["derive"] }
14
16
  serde_magnus = "0.8.1"
15
- kanal = "0.1.0-pre8"
@@ -1,7 +1,9 @@
1
1
  use super::{
2
+ header_cache::StringCache,
2
3
  parser::RecordParser,
3
4
  reader::{ReadImpl, RecordReader},
4
5
  };
6
+ use flate2::read::GzDecoder;
5
7
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
6
8
  use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
7
9
 
@@ -74,7 +76,12 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
74
76
  format!("Failed to open file: {e}"),
75
77
  )
76
78
  })?;
77
- Ok(Box::new(file))
79
+ if path.ends_with(".gz") {
80
+ let file = GzDecoder::new(file);
81
+ Ok(Box::new(file))
82
+ } else {
83
+ Ok(Box::new(file))
84
+ }
78
85
  }
79
86
  }
80
87
 
@@ -89,11 +96,19 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
89
96
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
90
97
  let null_string = self.null_string;
91
98
 
99
+ let static_headers = StringCache::intern_many(&headers).map_err(|e| {
100
+ Error::new(
101
+ self.ruby.exception_runtime_error(),
102
+ format!("Failed to intern headers: {e}"),
103
+ )
104
+ })?;
105
+ let headers_for_cleanup = static_headers.clone();
106
+
92
107
  let (sender, receiver) = kanal::bounded(self.buffer);
93
108
  let handle = thread::spawn(move || {
94
109
  let mut record = csv::StringRecord::new();
95
110
  while let Ok(true) = reader.read_record(&mut record) {
96
- let row = T::parse(&headers, &record, &null_string);
111
+ let row = T::parse(&static_headers, &record, &null_string);
97
112
  if sender.send(row).is_err() {
98
113
  break;
99
114
  }
@@ -104,6 +119,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
104
119
 
105
120
  Ok(RecordReader {
106
121
  reader: ReadImpl::MultiThreaded {
122
+ headers: headers_for_cleanup,
107
123
  receiver,
108
124
  handle: Some(handle),
109
125
  },
@@ -0,0 +1,71 @@
1
+ //! This module exists to avoid cloning header keys in returned HashMaps.
2
+ //! Since the underlying RString creation already involves cloning,
3
+ //! this caching layer aims to reduce redundant allocations.
4
+ //!
5
+ //! Note: Performance testing on macOS showed minimal speed improvements,
6
+ //! so this optimization could be removed if any issues arise.
7
+
8
+
9
+ use std::{
10
+ collections::HashMap,
11
+ sync::{atomic::AtomicU32, LazyLock, Mutex},
12
+ };
13
+
14
/// Process-wide table of interned header strings.
///
/// Each key is a heap string leaked with `Box::leak`; the value counts how many
/// outstanding `intern`/`intern_many` acquisitions still reference it. All access
/// goes through the mutex, so the atomic counter only needs relaxed ordering.
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

/// Namespace for the reference-counted string interner backing CSV header keys.
pub struct StringCache {}

impl StringCache {
    /// Looks `string` up in an already-locked cache, bumping its reference count
    /// on a hit or leaking a new boxed copy (count = 1) on a miss.
    ///
    /// Taking a `Cow` lets `intern` hand over its owned `String` without an extra
    /// copy while `intern_many` only clones on a miss.
    fn intern_locked(
        cache: &mut HashMap<&'static str, AtomicU32>,
        string: std::borrow::Cow<'_, str>,
    ) -> &'static str {
        if let Some((&existing, count)) = cache.get_key_value(&*string) {
            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            return existing;
        }

        let leaked: &'static str = Box::leak(string.into_owned().into_boxed_str());
        cache.insert(leaked, AtomicU32::new(1));
        leaked
    }

    /// Interns a single owned string and returns the shared `&'static str`.
    ///
    /// Every successful call must eventually be balanced by one `clear` call
    /// containing the returned reference.
    ///
    /// # Errors
    /// Returns the poison error message if the cache mutex is poisoned.
    #[allow(dead_code)] // kept as public API; the visible callers use `intern_many`
    pub fn intern(string: String) -> Result<&'static str, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
        Ok(Self::intern_locked(
            &mut cache,
            std::borrow::Cow::Owned(string),
        ))
    }

    /// Interns every string in `strings` under a single lock acquisition.
    ///
    /// The result has the same length and order as `strings`. Each element counts
    /// as one acquisition, so a header that appears twice is counted twice.
    ///
    /// # Errors
    /// Returns the poison error message if the cache mutex is poisoned.
    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

        Ok(strings
            .iter()
            .map(|s| Self::intern_locked(&mut cache, std::borrow::Cow::Borrowed(s.as_str())))
            .collect())
    }

    /// Releases one acquisition for each entry of `headers`.
    ///
    /// When the last acquisition of a string is released, its entry is removed
    /// from the cache and the leaked allocation is freed. Strings that are not in
    /// the cache are ignored.
    ///
    /// Callers must not read any copy of a released reference after its final
    /// `clear`: the `'static` lifetime is only upheld while the count is non-zero.
    ///
    /// # Errors
    /// Returns the poison error message if the cache mutex is poisoned.
    pub fn clear(headers: &[&'static str]) -> Result<(), String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

        for &header in headers {
            let Some(count) = cache.get(header) else {
                continue;
            };

            // `fetch_sub` returns the value *before* the decrement, so the last
            // reference is being released exactly when the previous value was 1.
            let previous = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
            if previous != 1 {
                continue;
            }

            // Remove the entry first (the lookup still reads the key bytes), and
            // free the pointer stored in the map rather than the caller's: an
            // equal string that was not produced by this cache must never be
            // passed to `Box::from_raw`.
            if let Some((interned, _)) = cache.remove_entry(header) {
                let ptr = interned as *const str as *mut str;
                // SAFETY: `interned` was created by `Box::leak` in `intern_locked`,
                // has just been removed from the map so it cannot be freed twice,
                // and its reference count reached zero, meaning every acquirer has
                // declared it is done with the string.
                unsafe {
                    drop(Box::from_raw(ptr));
                }
            }
        }

        Ok(())
    }
}
@@ -1,8 +1,8 @@
1
1
  mod builder;
2
+ mod header_cache;
2
3
  mod parser;
3
4
  mod reader;
4
5
  mod record;
5
6
 
6
7
  pub use builder::RecordReaderBuilder;
7
8
  pub use record::CsvRecord;
8
-
@@ -2,24 +2,24 @@ use std::collections::HashMap;
2
2
 
3
3
  pub trait RecordParser {
4
4
  type Output;
5
- fn parse<'a>(
6
- headers: &'a [String],
5
+ fn parse(
6
+ headers: &[&'static str],
7
7
  record: &csv::StringRecord,
8
8
  null_string: &str,
9
9
  ) -> Self::Output;
10
10
  }
11
11
 
12
- impl RecordParser for HashMap<String, Option<String>> {
12
+ impl RecordParser for HashMap<&'static str, Option<String>> {
13
13
  type Output = Self;
14
- fn parse<'a>(
15
- headers: &'a [String],
14
+ fn parse(
15
+ headers: &[&'static str],
16
16
  record: &csv::StringRecord,
17
17
  null_string: &str,
18
18
  ) -> Self::Output {
19
19
  let mut map = HashMap::with_capacity(headers.len());
20
20
  for (header, field) in headers.iter().zip(record.iter()) {
21
21
  map.insert(
22
- header.clone(),
22
+ *header,
23
23
  if field == null_string {
24
24
  None
25
25
  } else {
@@ -33,8 +33,8 @@ impl RecordParser for HashMap<String, Option<String>> {
33
33
 
34
34
  impl RecordParser for Vec<Option<String>> {
35
35
  type Output = Self;
36
- fn parse<'a>(
37
- _headers: &'a [String],
36
+ fn parse(
37
+ _headers: &[&'static str],
38
38
  record: &csv::StringRecord,
39
39
  null_string: &str,
40
40
  ) -> Self::Output {
@@ -1,4 +1,4 @@
1
- use super::parser::RecordParser;
1
+ use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use magnus::{Error, Ruby};
3
3
  use std::{io::Read, thread};
4
4
 
@@ -6,14 +6,36 @@ pub struct RecordReader<T: RecordParser> {
6
6
  pub(crate) reader: ReadImpl<T>,
7
7
  }
8
8
 
9
+ impl<T: RecordParser> Drop for RecordReader<T> {
10
+ fn drop(&mut self) {
11
+ match &mut self.reader {
12
+ ReadImpl::MultiThreaded {
13
+ receiver,
14
+ handle,
15
+ headers,
16
+ } => {
17
+ receiver.close();
18
+ if let Some(handle) = handle.take() {
19
+ let _ = handle.join();
20
+ }
21
+ StringCache::clear(headers).unwrap();
22
+ }
23
+ ReadImpl::SingleThreaded { headers, .. } => {
24
+ StringCache::clear(headers).unwrap();
25
+ }
26
+ }
27
+ }
28
+ }
29
+
9
30
  #[allow(dead_code)]
10
31
  pub enum ReadImpl<T: RecordParser> {
11
32
  SingleThreaded {
12
33
  reader: csv::Reader<Box<dyn Read + Send + 'static>>,
13
- headers: Vec<String>,
34
+ headers: Vec<&'static str>,
14
35
  null_string: String,
15
36
  },
16
37
  MultiThreaded {
38
+ headers: Vec<&'static str>,
17
39
  receiver: kanal::Receiver<T::Output>,
18
40
  handle: Option<thread::JoinHandle<()>>,
19
41
  },
@@ -48,7 +70,9 @@ impl<T: RecordParser> Iterator for RecordReader<T> {
48
70
 
49
71
  fn next(&mut self) -> Option<Self::Item> {
50
72
  match &mut self.reader {
51
- ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
73
+ ReadImpl::MultiThreaded {
74
+ receiver, handle, ..
75
+ } => match receiver.recv() {
52
76
  Ok(record) => Some(record),
53
77
  Err(_) => {
54
78
  if let Some(handle) = handle.take() {
@@ -1,17 +1,23 @@
1
- use magnus::{IntoValue, Ruby, Value};
1
+ use magnus::{IntoValue, RHash, Ruby, Value};
2
2
  use std::collections::HashMap;
3
3
 
4
4
  #[derive(Debug)]
5
5
  pub enum CsvRecord {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<String, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>>),
8
8
  }
9
9
 
10
10
  impl IntoValue for CsvRecord {
11
11
  fn into_value_with(self, handle: &Ruby) -> Value {
12
12
  match self {
13
13
  CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
- CsvRecord::Map(map) => map.into_value_with(handle),
14
+ CsvRecord::Map(map) => {
15
+ let hash = RHash::new();
16
+ for (k, v) in map {
17
+ hash.aset(k, v).unwrap();
18
+ }
19
+ hash.into_value_with(handle)
20
+ }
15
21
  }
16
22
  }
17
23
  }
@@ -1,15 +1,15 @@
1
- use std::collections::HashMap;
2
-
3
1
  use crate::csv::{CsvRecord, RecordReaderBuilder};
4
2
  use crate::utils::*;
5
3
  use magnus::value::ReprValue;
6
4
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
+ use std::collections::HashMap;
7
6
 
8
7
  pub fn parse_csv(
9
- ruby: &Ruby,
10
8
  rb_self: Value,
11
9
  args: &[Value],
12
10
  ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
11
+ let ruby = unsafe { Ruby::get_unchecked() };
12
+
13
13
  let CsvArgs {
14
14
  to_read,
15
15
  has_headers,
@@ -18,7 +18,7 @@ pub fn parse_csv(
18
18
  null_string,
19
19
  buffer_size,
20
20
  result_type,
21
- } = parse_csv_args(ruby, args)?;
21
+ } = parse_csv_args(&ruby, args)?;
22
22
 
23
23
  if !ruby.block_given() {
24
24
  return create_enumerator(EnumeratorArgs {
@@ -35,7 +35,7 @@ pub fn parse_csv(
35
35
 
36
36
  let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
37
37
  "hash" => Box::new(
38
- RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
38
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
39
39
  .has_headers(has_headers)
40
40
  .delimiter(delimiter)
41
41
  .quote_char(quote_char)
@@ -45,7 +45,7 @@ pub fn parse_csv(
45
45
  .map(CsvRecord::Map),
46
46
  ),
47
47
  "array" => Box::new(
48
- RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
48
+ RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
49
49
  .has_headers(has_headers)
50
50
  .delimiter(delimiter)
51
51
  .quote_char(quote_char)
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.1"
2
+ VERSION = "0.3.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
@@ -56,6 +56,7 @@ files:
56
56
  - ext/osv/Cargo.toml
57
57
  - ext/osv/extconf.rb
58
58
  - ext/osv/src/csv/builder.rs
59
+ - ext/osv/src/csv/header_cache.rs
59
60
  - ext/osv/src/csv/mod.rs
60
61
  - ext/osv/src/csv/parser.rs
61
62
  - ext/osv/src/csv/reader.rs