osv 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -9
- data/ext/osv/src/csv/builder.rs +114 -0
- data/ext/osv/src/csv/mod.rs +8 -0
- data/ext/osv/src/csv/parser.rs +43 -0
- data/ext/osv/src/csv/reader.rs +73 -0
- data/ext/osv/src/csv/record.rs +17 -0
- data/ext/osv/src/lib.rs +1 -1
- data/ext/osv/src/reader.rs +81 -216
- data/ext/osv/src/utils.rs +108 -5
- data/lib/osv/version.rb +1 -1
- data/lib/osv.rbi +20 -22
- metadata +6 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f22d1d56b0eba1e23ca192db2c70e68689486e2f0032672285017e1f98a530d2
+  data.tar.gz: 6288dce70b95faf312e8aa244ba56a60c3d59b85bdd16a4a951060df78b97e1e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5399b43ecd3987c73daf09341d51a1ee8e5d060f1085e9a7aac9b823ab723ccbbc4084c5c0c9abbd36e7e17becfcfa1757af6af3666e01ef3992a27e35b5b983
+  data.tar.gz: 2a6d98b645af40ab08a5a01a2bcf5b67ce9ebff18a993e602f6b00fa2f3f80d65c63605a7f5a715286ac20751db0607c515686e4439156a4039ae24f49e95e10
data/README.md
CHANGED
@@ -1,13 +1,13 @@
 # OSV
 
+[](https://badge.fury.io/rb/osv)
+
 OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate.
 
 It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
 
 The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
 
-I have yet to figure out how to get rust to accept an implementation of this as one method with different return types, so I've had to implement two methods.
-
 ## Installation
 
 Add this line to your application's Gemfile:
@@ -53,7 +53,7 @@ rows.each { |row| puts row["name"] }
 If you prefer working with arrays instead of hashes, use `for_each_compat`:
 
 ```ruby
-OSV.
+OSV.for_each("path/to/file.csv", result_type: :array) do |row|
   # row is an Array like ["John", "25"]
   puts row[0]
 end
@@ -64,11 +64,11 @@ end
 Both methods support the following options:
 
 - `has_headers`: Boolean indicating if the first row contains headers (default: true)
-- `
+- `col_sep`: String specifying the field separator (default: ",")
 
 ```ruby
 # Reading TSV files
-OSV.for_each("path/to/file.tsv",
+OSV.for_each("path/to/file.tsv", col_sep: "\t") do |row|
   puts row["name"]
 end
 
@@ -111,7 +111,3 @@ OSV.for_each(data) { |row| puts row["name"] }
 ## Performance
 
 This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
-
-## License
-
-This gem is not currently licensed for public use.
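The README changes above consolidate usage around a single `for_each` entry point driven by keyword arguments. A short usage sketch assembled only from the options shown in this diff (the file paths and field names are illustrative):

```ruby
require "osv"

# Hash rows (default): each row is a Hash keyed by the header row.
OSV.for_each("path/to/file.csv") do |row|
  puts row["name"]
end

# Array rows: result_type: :array skips hash keys for speed.
OSV.for_each("path/to/file.csv", result_type: :array) do |row|
  puts row[0]
end

# Tab-separated files: col_sep selects the field separator.
OSV.for_each("path/to/file.tsv", col_sep: "\t") do |row|
  puts row["name"]
end
```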
data/ext/osv/src/csv/builder.rs
ADDED
@@ -0,0 +1,114 @@
+use super::{
+    parser::RecordParser,
+    reader::{ReadImpl, RecordReader},
+};
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
+use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
+
+pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
+    ruby: &'a Ruby,
+    to_read: Value,
+    has_headers: bool,
+    delimiter: u8,
+    quote_char: u8,
+    null_string: String,
+    buffer: usize,
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
+    pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
+        Self {
+            ruby,
+            to_read,
+            has_headers: true,
+            delimiter: b',',
+            quote_char: b'"',
+            null_string: String::new(),
+            buffer: 1000,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub fn has_headers(mut self, has_headers: bool) -> Self {
+        self.has_headers = has_headers;
+        self
+    }
+
+    pub fn delimiter(mut self, delimiter: u8) -> Self {
+        self.delimiter = delimiter;
+        self
+    }
+
+    pub fn quote_char(mut self, quote_char: u8) -> Self {
+        self.quote_char = quote_char;
+        self
+    }
+
+    pub fn null_string(mut self, null_string: String) -> Self {
+        self.null_string = null_string;
+        self
+    }
+
+    pub fn buffer(mut self, buffer: usize) -> Self {
+        self.buffer = buffer;
+        self
+    }
+
+    fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, Error> {
+        let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
+
+        if self.to_read.is_kind_of(string_io) {
+            let string: RString = self.to_read.funcall("string", ())?;
+            let content = string.to_string()?;
+            Ok(Box::new(std::io::Cursor::new(content)))
+        } else if self.to_read.is_kind_of(self.ruby.class_io()) {
+            let fd = unsafe { rb_sys::rb_io_descriptor(self.to_read.as_raw()) };
+            let file = unsafe { File::from_raw_fd(fd) };
+            Ok(Box::new(file))
+        } else {
+            let path = self.to_read.to_r_string()?.to_string()?;
+            let file = std::fs::File::open(&path).map_err(|e| {
+                Error::new(
+                    self.ruby.exception_runtime_error(),
+                    format!("Failed to open file: {e}"),
+                )
+            })?;
+            Ok(Box::new(file))
+        }
+    }
+
+    pub fn build(self) -> Result<RecordReader<T>, Error> {
+        let readable = self.get_reader()?;
+
+        let mut reader = csv::ReaderBuilder::new()
+            .has_headers(self.has_headers)
+            .delimiter(self.delimiter)
+            .quote(self.quote_char)
+            .from_reader(readable);
+
+        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let headers_clone = headers.clone();
+        let null_string = self.null_string;
+
+        let (sender, receiver) = kanal::bounded(self.buffer);
+        let handle = thread::spawn(move || {
+            let mut record = csv::StringRecord::new();
+            while let Ok(true) = reader.read_record(&mut record) {
+                let row = T::parse(&headers_clone, &record, &null_string);
+                if sender.send(row).is_err() {
+                    break;
+                }
+            }
+            let file_to_forget = reader.into_inner();
+            std::mem::forget(file_to_forget);
+        });
+
+        Ok(RecordReader {
+            reader: ReadImpl::MultiThreaded {
+                receiver,
+                handle: Some(handle),
+            },
+        })
+    }
+}
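`get_reader` above branches on the input: a `StringIO` is read via its underlying string, a Ruby `IO` is wrapped around its raw file descriptor (and the Rust handle is deliberately leaked so Ruby keeps ownership of the descriptor), and anything else is treated as a path. A hedged sketch of what this should permit from Ruby; the CSV content is invented for illustration:

```ruby
require "osv"
require "stringio"

# In-memory input via StringIO
data = StringIO.new("name,age\nJohn,25\n")
OSV.for_each(data) { |row| puts row["name"] }

# An already-open IO should also be accepted
File.open("path/to/file.csv") do |io|
  OSV.for_each(io) { |row| puts row["age"] }
end

# A plain String is treated as a file path
OSV.for_each("path/to/file.csv") { |row| puts row["name"] }
```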
data/ext/osv/src/csv/parser.rs
ADDED
@@ -0,0 +1,43 @@
+use std::collections::HashMap;
+
+pub trait RecordParser {
+    type Output;
+
+    fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output;
+}
+
+impl RecordParser for HashMap<String, Option<String>> {
+    type Output = Self;
+
+    fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
+        headers
+            .iter()
+            .zip(record.iter())
+            .map(|(header, field)| {
+                let value = if field == null_string {
+                    None
+                } else {
+                    Some(field.to_string())
+                };
+                (header.clone(), value)
+            })
+            .collect()
+    }
+}
+
+impl RecordParser for Vec<Option<String>> {
+    type Output = Self;
+
+    fn parse(_headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
+        record
+            .iter()
+            .map(|field| {
+                if field == null_string {
+                    None
+                } else {
+                    Some(field.to_string())
+                }
+            })
+            .collect()
+    }
+}
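Both `RecordParser` implementations turn a field into `None` when it equals `null_string`, which reaches Ruby as `nil`; the keyword is exposed as `nil_string` elsewhere in this diff. A small sketch of the expected effect (sample data and sentinel are illustrative):

```ruby
require "osv"
require "stringio"

data = StringIO.new("name,age\nJohn,NULL\n")

OSV.for_each(data, nil_string: "NULL") do |row|
  row["name"]  # "John"
  row["age"]   # nil, because the field matched nil_string
end
```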
data/ext/osv/src/csv/reader.rs
ADDED
@@ -0,0 +1,73 @@
+use super::parser::RecordParser;
+use magnus::{Error, Ruby};
+use std::{io::Read, thread};
+
+pub struct RecordReader<T: RecordParser> {
+    pub(crate) reader: ReadImpl<T>,
+}
+
+#[allow(dead_code)]
+pub enum ReadImpl<T: RecordParser> {
+    SingleThreaded {
+        reader: csv::Reader<Box<dyn Read + Send + 'static>>,
+        headers: Vec<String>,
+        null_string: String,
+    },
+    MultiThreaded {
+        receiver: kanal::Receiver<T::Output>,
+        handle: Option<thread::JoinHandle<()>>,
+    },
+}
+
+impl<T: RecordParser> RecordReader<T> {
+    pub(crate) fn get_headers(
+        ruby: &Ruby,
+        reader: &mut csv::Reader<impl Read>,
+        has_headers: bool,
+    ) -> Result<Vec<String>, Error> {
+        let first_row = reader
+            .headers()
+            .map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to read headers: {e}"),
+                )
+            })?
+            .clone();
+
+        Ok(if has_headers {
+            first_row.iter().map(String::from).collect()
+        } else {
+            (0..first_row.len()).map(|i| format!("c{i}")).collect()
+        })
+    }
+}
+
+impl<T: RecordParser> Iterator for RecordReader<T> {
+    type Item = T::Output;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.reader {
+            ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
+                Ok(record) => Some(record),
+                Err(_) => {
+                    if let Some(handle) = handle.take() {
+                        let _ = handle.join();
+                    }
+                    None
+                }
+            },
+            ReadImpl::SingleThreaded {
+                reader,
+                headers,
+                null_string,
+            } => {
+                let mut record = csv::StringRecord::new();
+                match reader.read_record(&mut record) {
+                    Ok(true) => Some(T::parse(headers, &record, null_string)),
+                    _ => None,
+                }
+            }
+        }
+    }
+}
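`get_headers` synthesizes column names (`c0`, `c1`, ...) when `has_headers` is false, so hash rows keep stable keys even without a header row. A sketch of how that presumably looks from Ruby (sample data invented):

```ruby
require "osv"
require "stringio"

data = StringIO.new("John,25\nJane,30\n")

OSV.for_each(data, has_headers: false) do |row|
  # No header row is consumed, so both lines are yielded as data
  puts row["c0"]  # "John", then "Jane"
  puts row["c1"]  # "25", then "30"
end
```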
data/ext/osv/src/csv/record.rs
ADDED
@@ -0,0 +1,17 @@
+use magnus::{IntoValue, Ruby, Value};
+use std::collections::HashMap;
+
+#[derive(Debug)]
+pub enum CsvRecord {
+    Vec(Vec<Option<String>>),
+    Map(HashMap<String, Option<String>>),
+}
+
+impl IntoValue for CsvRecord {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self {
+            CsvRecord::Vec(vec) => vec.into_value_with(handle),
+            CsvRecord::Map(map) => map.into_value_with(handle),
+        }
+    }
+}
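`CsvRecord` is the one item type handed back to Ruby: `Map` converts to a Hash of nilable Strings and `Vec` to an Array of nilable Strings, chosen by `result_type`. The two shapes, sketched with illustrative values:

```ruby
require "osv"
require "stringio"

data = StringIO.new("name,age\nJohn,25\n")

OSV.for_each(data) do |row|
  row  # {"name" => "John", "age" => "25"}
end

OSV.for_each(data, result_type: :array) do |row|
  row  # ["John", "25"]
end
```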
data/ext/osv/src/lib.rs
CHANGED
@@ -1,3 +1,4 @@
+mod csv;
 mod reader;
 mod utils;
 
@@ -10,6 +11,5 @@ use magnus::{Error, Ruby};
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("OSV")?;
     module.define_module_function("for_each", magnus::method!(parse_csv, -1))?;
-    module.define_module_function("for_each_compat", magnus::method!(parse_compat, -1))?;
     Ok(())
 }
data/ext/osv/src/reader.rs
CHANGED
@@ -1,235 +1,100 @@
+use std::collections::HashMap;
+
+use crate::csv::{CsvRecord, RecordReaderBuilder};
 use crate::utils::*;
-use magnus::
-
-};
-use std::{collections::HashMap, fs::File, io::Read, os::fd::FromRawFd, thread};
+use magnus::value::ReprValue;
+use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
 
-/// Parses CSV data from a file and yields each row as a hash to the block.
 pub fn parse_csv(
     ruby: &Ruby,
     rb_self: Value,
     args: &[Value],
-) -> Result<Yield<
-
-    return Ok(Yield::Enumerator(rb_self.enumeratorize("for_each", args)));
-}
-    let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
-
-    let iter = RecordReader::<HashMap<String, String>>::new(
-        ruby,
+) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
+    let CsvArgs {
         to_read,
         has_headers,
-        delimiter
-
-
+        delimiter,
+        quote_char,
+        null_string,
+        buffer_size,
+        result_type,
+    } = parse_csv_args(ruby, args)?;
 
-    Ok(Yield::Iter(iter))
-}
-
-pub fn parse_compat(
-    ruby: &Ruby,
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<impl Iterator<Item = Vec<String>>>, Error> {
     if !ruby.block_given() {
-        return
-            rb_self
-
-
-
-
-
-
-
-
-            delimiter.unwrap_or_else(|| ",".to_string()).as_bytes()[0],
-            1000,
-        )?;
-
-    Ok(Yield::Iter(iter))
-}
-
-pub trait RecordParser {
-    type Output;
-
-    fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output;
-}
-
-impl RecordParser for HashMap<String, String> {
-    type Output = Self;
-
-    fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output {
-        let capacity = headers.len();
-        let mut map = HashMap::with_capacity(capacity);
-        for (i, field) in record.iter().enumerate() {
-            map.insert(headers[i].to_owned(), field.to_string());
-        }
-        map
+        return create_enumerator(EnumeratorArgs {
+            rb_self,
+            to_read,
+            has_headers,
+            delimiter,
+            quote_char,
+            null_string,
+            buffer_size,
+            result_type,
+        });
     }
-}
-
-impl RecordParser for Vec<String> {
-    type Output = Self;
 
-
-
-
-
+    let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
+        "hash" => Box::new(
+            RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
+                .has_headers(has_headers)
+                .delimiter(delimiter)
+                .quote_char(quote_char)
+                .null_string(null_string)
+                .buffer(buffer_size)
+                .build()?
+                .map(CsvRecord::Map),
+        ),
+        "array" => Box::new(
+            RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
+                .has_headers(has_headers)
+                .delimiter(delimiter)
+                .quote_char(quote_char)
+                .null_string(null_string)
+                .buffer(buffer_size)
+                .build()?
+                .map(CsvRecord::Vec),
+        ),
+        _ => {
+            return Err(Error::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
         }
-
-    }
-}
-
-struct RecordReader<T: RecordParser> {
-    reader: ReadImpl<T>,
-}
+    };
 
-
-enum ReadImpl<T: RecordParser> {
-    SingleThreaded {
-        reader: csv::Reader<Box<dyn Read + Send + 'static>>,
-        headers: Vec<String>,
-    },
-    MultiThreaded {
-        receiver: kanal::Receiver<T::Output>,
-        handle: Option<thread::JoinHandle<()>>,
-    },
+    Ok(Yield::Iter(iter))
 }
 
-
-
-
-
-
-
-
-
-
-            Error::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to get StringIO class: {}", e),
-            )
-        })?;
-
-        let readable: Box<dyn Read + Send + 'static> = if to_read.is_kind_of(string_io) {
-            let string: RString = to_read.funcall("string", ()).map_err(|e| {
-                Error::new(
-                    ruby.exception_runtime_error(),
-                    format!("Failed to get string from StringIO: {}", e),
-                )
-            })?;
-            let content = string.to_string().map_err(|e| {
-                Error::new(
-                    ruby.exception_runtime_error(),
-                    format!("Failed to convert string to Rust String: {}", e),
-                )
-            })?;
-            Box::new(std::io::Cursor::new(content))
-        } else if to_read.is_kind_of(ruby.class_io()) {
-            let fd = unsafe { rb_sys::rb_io_descriptor(to_read.as_raw()) };
-            let file = unsafe { File::from_raw_fd(fd) };
-            Box::new(file)
-        } else {
-            let path = to_read
-                .to_r_string()
-                .map_err(|e| {
-                    Error::new(
-                        ruby.exception_runtime_error(),
-                        format!("Failed to convert path to string: {}", e),
-                    )
-                })?
-                .to_string()
-                .map_err(|e| {
-                    Error::new(
-                        ruby.exception_runtime_error(),
-                        format!("Failed to convert RString to Rust String: {}", e),
-                    )
-                })?;
-            let file = std::fs::File::open(&path).map_err(|e| {
-                Error::new(
-                    ruby.exception_runtime_error(),
-                    format!("Failed to open file: {}", e),
-                )
-            })?;
-            Box::new(file)
-        };
-
-        let mut reader = csv::ReaderBuilder::new()
-            .has_headers(has_headers)
-            .delimiter(delimiter)
-            .from_reader(readable);
-
-        let headers = Self::get_headers(&mut reader, has_headers)?;
-        let headers_clone = headers.clone();
-
-        let (sender, receiver) = kanal::bounded(buffer);
-        let handle = thread::spawn(move || {
-            let mut record = csv::StringRecord::new();
-            while let Ok(read) = reader.read_record(&mut record) {
-                if !read {
-                    let file_to_forget = reader.into_inner();
-                    std::mem::forget(file_to_forget);
-                    break;
-                }
-                let row = T::parse(&headers_clone, &record);
-                if sender.send(row).is_err() {
-                    break;
-                }
-            }
-        });
-
-        let read_impl = ReadImpl::MultiThreaded {
-            receiver,
-            handle: Some(handle),
-        };
-
-        Ok(Self { reader: read_impl })
-    }
-
-    fn get_headers(
-        reader: &mut csv::Reader<impl Read>,
-        has_headers: bool,
-    ) -> Result<Vec<String>, Error> {
-        let first_row = reader
-            .headers()
-            .map_err(|e| {
-                Error::new(
-                    magnus::exception::runtime_error(),
-                    format!("Failed to read headers: {}", e),
-                )
-            })?
-            .clone();
-        let num_fields = first_row.len();
-
-        Ok(if has_headers {
-            first_row.iter().map(|h| h.to_string()).collect()
-        } else {
-            (0..num_fields).map(|i| format!("c{}", i)).collect()
-        })
-    }
+struct EnumeratorArgs {
+    rb_self: Value,
+    to_read: Value,
+    has_headers: bool,
+    delimiter: u8,
+    quote_char: u8,
+    null_string: String,
+    buffer_size: usize,
+    result_type: String,
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-}
+fn create_enumerator(
+    args: EnumeratorArgs,
+) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
+    let kwargs = RHash::new();
+    kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
+    kwargs.aset(
+        Symbol::new("col_sep"),
+        String::from_utf8(vec![args.delimiter]).unwrap(),
+    )?;
+    kwargs.aset(
+        Symbol::new("quote_char"),
+        String::from_utf8(vec![args.quote_char]).unwrap(),
+    )?;
+    kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
+    kwargs.aset(Symbol::new("buffer_size"), args.buffer_size)?;
+    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+
+    let enumerator = args
+        .rb_self
+        .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
+    Ok(Yield::Enumerator(enumerator))
 }
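When no block is given, `parse_csv` now rebuilds the parsed options as keyword arguments and returns an Enumerator over the same `for_each` call, which should allow lazy, chainable iteration (paths illustrative):

```ruby
require "osv"

rows = OSV.for_each("path/to/file.csv")   # no block: an Enumerator is returned
rows.each { |row| puts row["name"] }

# Enumerator semantics allow partial consumption as well
first_two = OSV.for_each("path/to/file.csv", result_type: :array).take(2)
```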
data/ext/osv/src/utils.rs
CHANGED
@@ -1,20 +1,123 @@
 use magnus::{
     scan_args::{get_kwargs, scan_args},
-
+    value::ReprValue,
+    Error, RString, Ruby, Symbol, Value,
 };
 
+#[derive(Debug)]
+pub struct CsvArgs {
+    pub to_read: Value,
+    pub has_headers: bool,
+    pub delimiter: u8,
+    pub quote_char: u8,
+    pub null_string: String,
+    pub buffer_size: usize,
+    pub result_type: String,
+}
+
 /// Parse common arguments for CSV parsing
-pub fn parse_csv_args(args: &[Value]) -> Result<
+pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<bool>,
+            Option<String>,
+            Option<String>,
+            Option<String>,
+            Option<usize>,
+            Option<Value>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
-        &[
+        &[
+            "has_headers",
+            "col_sep",
+            "quote_char",
+            "nil_string",
+            "buffer_size",
+            "result_type",
+        ],
     )?;
 
     let has_headers = kwargs.optional.0.unwrap_or(true);
 
-
+    let delimiter = *kwargs
+        .optional
+        .1
+        .unwrap_or_else(|| ",".to_string())
+        .as_bytes()
+        .first()
+        .ok_or_else(|| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                "Delimiter cannot be empty",
+            )
+        })?;
+
+    let quote_char = *kwargs
+        .optional
+        .2
+        .unwrap_or_else(|| "\"".to_string())
+        .as_bytes()
+        .first()
+        .ok_or_else(|| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                "Quote character cannot be empty",
+            )
+        })?;
+
+    let null_string = kwargs.optional.3.unwrap_or_else(|| "".to_string());
+
+    let buffer_size = kwargs.optional.4.unwrap_or(1000);
+
+    let result_type = match kwargs.optional.5 {
+        Some(value) => {
+            let parsed = if value.is_kind_of(ruby.class_string()) {
+                RString::from_value(value)
+                    .ok_or_else(|| {
+                        Error::new(magnus::exception::type_error(), "Invalid string value")
+                    })?
+                    .to_string()?
+            } else if value.is_kind_of(ruby.class_symbol()) {
+                Symbol::from_value(value)
+                    .ok_or_else(|| {
+                        Error::new(magnus::exception::type_error(), "Invalid symbol value")
+                    })?
+                    .funcall("to_s", ())?
+            } else {
+                return Err(Error::new(
+                    magnus::exception::type_error(),
+                    "result_type must be a String or Symbol",
+                ));
+            };
+
+            match parsed.as_str() {
+                "hash" | "array" => parsed,
+                _ => {
+                    return Err(Error::new(
+                        magnus::exception::runtime_error(),
+                        "result_type must be either 'hash' or 'array'",
+                    ))
+                }
+            }
+        }
+        None => String::from("hash"),
+    };
+
+    Ok(CsvArgs {
+        to_read,
+        has_headers,
+        delimiter,
+        quote_char,
+        null_string,
+        buffer_size,
+        result_type,
+    })
 }
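`parse_csv_args` restricts `result_type` to "hash" or "array" (given as a String or a Symbol) and rejects an empty `col_sep` or `quote_char`. Since the errors go through magnus exception classes, they should surface in Ruby roughly as sketched below; the exact exception classes are an assumption based on the `runtime_error`/`type_error` calls above:

```ruby
require "osv"
require "stringio"

data = StringIO.new("name,age\nJohn,25\n")

# Accepted: result_type as a Symbol or a String
OSV.for_each(data, result_type: :array) { |row| row }
OSV.for_each(data, result_type: "hash") { |row| row }

begin
  OSV.for_each(data, result_type: :csv) { |row| row }
rescue RuntimeError => e
  e.message  # "result_type must be either 'hash' or 'array'"
end

begin
  OSV.for_each(data, col_sep: "") { |row| row }
rescue RuntimeError => e
  e.message  # "Delimiter cannot be empty"
end
```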
data/lib/osv/version.rb
CHANGED
data/lib/osv.rbi
CHANGED
@@ -2,28 +2,26 @@
 
 module OSV
   sig do
-
-      .
-
-
-
-
-    )
-      .
+    params(
+      input: T.any(String, StringIO, IO),
+      has_headers: T.nilable(T::Boolean),
+      col_sep: T.nilable(String),
+      quote_char: T.nilable(String),
+      nil_string: T.nilable(String),
+      buffer_size: T.nilable(Integer),
+      result_type: T.nilable(String),
+      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
+    ).returns(T.any(Enumerator, T.untyped))
   end
-  def self.for_each(
-
-
-
-
-
-
-
-
-
-  )
-    .returns(T.untyped)
-  end
-  def self.for_each_compat(input, has_headers: true, delimiter: nil, &blk)
+  def self.for_each(
+    input,
+    has_headers: true,
+    col_sep: nil,
+    quote_char: nil,
+    nil_string: nil,
+    buffer_size: nil,
+    result_type: nil,
+    &blk
+  )
   end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.3.0
 platform: ruby
 authors:
 - Nathan Jaremko
@@ -55,6 +55,11 @@ files:
 - ext/osv/Cargo.lock
 - ext/osv/Cargo.toml
 - ext/osv/extconf.rb
+- ext/osv/src/csv/builder.rs
+- ext/osv/src/csv/mod.rs
+- ext/osv/src/csv/parser.rs
+- ext/osv/src/csv/reader.rs
+- ext/osv/src/csv/record.rs
 - ext/osv/src/lib.rs
 - ext/osv/src/reader.rs
 - ext/osv/src/utils.rs