osv 0.2.0 → 0.3.0
- checksums.yaml +4 -4
- data/LICENSE +21 -0
- data/README.md +113 -0
- data/ext/osv/src/csv/builder.rs +114 -0
- data/ext/osv/src/csv/mod.rs +8 -0
- data/ext/osv/src/csv/parser.rs +43 -0
- data/ext/osv/src/csv/reader.rs +73 -0
- data/ext/osv/src/csv/record.rs +17 -0
- data/ext/osv/src/lib.rs +1 -1
- data/ext/osv/src/reader.rs +81 -211
- data/ext/osv/src/utils.rs +108 -5
- data/lib/osv/version.rb +1 -1
- data/lib/osv.rbi +20 -22
- metadata +8 -1
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f22d1d56b0eba1e23ca192db2c70e68689486e2f0032672285017e1f98a530d2
+  data.tar.gz: 6288dce70b95faf312e8aa244ba56a60c3d59b85bdd16a4a951060df78b97e1e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5399b43ecd3987c73daf09341d51a1ee8e5d060f1085e9a7aac9b823ab723ccbbc4084c5c0c9abbd36e7e17becfcfa1757af6af3666e01ef3992a27e35b5b983
+  data.tar.gz: 2a6d98b645af40ab08a5a01a2bcf5b67ce9ebff18a993e602f6b00fa2f3f80d65c63605a7f5a715286ac20751db0607c515686e4439156a4039ae24f49e95e10
data/LICENSE ADDED

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Nathan Jaremko
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md ADDED

@@ -0,0 +1,113 @@
+# OSV
+
+[![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv)
+
+OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate.
+
+It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
+
+The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'osv'
+```
+
+And then execute:
+
+```bash
+bundle install
+```
+
+Or install it directly:
+
+```bash
+gem install osv
+```
+
+## Usage
+
+### Basic Usage with Hash Output
+
+Each row is returned as a hash where the keys are the column headers:
+
+```ruby
+require 'osv'
+
+# Read from a file
+OSV.for_each("path/to/file.csv") do |row|
+  # row is a Hash like {"name" => "John", "age" => "25"}
+  puts row["name"]
+end
+
+# Without a block, returns an Enumerator
+rows = OSV.for_each("path/to/file.csv")
+rows.each { |row| puts row["name"] }
+```
+
+### Array Output Mode
+
+If you prefer working with arrays instead of hashes, pass `result_type: :array`:
+
+```ruby
+OSV.for_each("path/to/file.csv", result_type: :array) do |row|
+  # row is an Array like ["John", "25"]
+  puts row[0]
+end
+```
+
+### Options
+
+`OSV.for_each` supports the following options:
+
+- `has_headers`: Boolean indicating if the first row contains headers (default: true)
+- `col_sep`: String specifying the field separator (default: ",")
+
+```ruby
+# Reading TSV files
+OSV.for_each("path/to/file.tsv", col_sep: "\t") do |row|
+  puts row["name"]
+end
+
+# Reading without headers
+OSV.for_each("path/to/file.csv", has_headers: false) do |row|
+  # Headers will be automatically generated as "c0", "c1", etc.
+  puts row["c0"]
+end
+```
+
+### Input Sources
+
+OSV supports reading from:
+
+- File paths (as strings)
+- IO objects
+  - Important caveat: the IO object must be backed by a real file descriptor (it is read via `rb_io_descriptor`).
+- StringIO objects
+  - Note: in this case the full string is copied into a Rust string and parsed there.
+
+```ruby
+# From file path
+OSV.for_each("path/to/file.csv") { |row| puts row["name"] }
+
+# From IO object
+File.open("path/to/file.csv") do |file|
+  OSV.for_each(file) { |row| puts row["name"] }
+end
+
+# From StringIO
+data = StringIO.new("name,age\nJohn,25")
+OSV.for_each(data) { |row| puts row["name"] }
+```
+
+## Requirements
+
+- Ruby >= 3.1.0
+- Rust toolchain (for installation from source)
+
+## Performance
+
+This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
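The Options section above only lists `has_headers` and `col_sep`, but the updated `osv.rbi` and argument parsing further down in this diff show that 0.3.0 also accepts `quote_char`, `nil_string`, `buffer_size`, and `result_type`. A minimal sketch of how those keywords combine, using made-up data:

```ruby
require "osv"
require "stringio"

# Illustrative input: fields quoted with single quotes, "NULL" marking missing values.
data = StringIO.new("name,age\n'John',NULL\n")

OSV.for_each(
  data,
  has_headers: true,
  col_sep: ",",        # field separator (default: ",")
  quote_char: "'",     # quote character (default: '"')
  nil_string: "NULL",  # fields equal to this string come back as nil
  buffer_size: 1000,   # rows buffered by the background reader thread (default: 1000)
  result_type: :hash   # :hash (default) or :array
) do |row|
  p row # => {"name" => "John", "age" => nil} (key order may vary)
end
```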
data/ext/osv/src/csv/builder.rs ADDED

@@ -0,0 +1,114 @@
+use super::{
+    parser::RecordParser,
+    reader::{ReadImpl, RecordReader},
+};
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
+use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
+
+pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
+    ruby: &'a Ruby,
+    to_read: Value,
+    has_headers: bool,
+    delimiter: u8,
+    quote_char: u8,
+    null_string: String,
+    buffer: usize,
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
+    pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
+        Self {
+            ruby,
+            to_read,
+            has_headers: true,
+            delimiter: b',',
+            quote_char: b'"',
+            null_string: String::new(),
+            buffer: 1000,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub fn has_headers(mut self, has_headers: bool) -> Self {
+        self.has_headers = has_headers;
+        self
+    }
+
+    pub fn delimiter(mut self, delimiter: u8) -> Self {
+        self.delimiter = delimiter;
+        self
+    }
+
+    pub fn quote_char(mut self, quote_char: u8) -> Self {
+        self.quote_char = quote_char;
+        self
+    }
+
+    pub fn null_string(mut self, null_string: String) -> Self {
+        self.null_string = null_string;
+        self
+    }
+
+    pub fn buffer(mut self, buffer: usize) -> Self {
+        self.buffer = buffer;
+        self
+    }
+
+    fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, Error> {
+        let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
+
+        if self.to_read.is_kind_of(string_io) {
+            let string: RString = self.to_read.funcall("string", ())?;
+            let content = string.to_string()?;
+            Ok(Box::new(std::io::Cursor::new(content)))
+        } else if self.to_read.is_kind_of(self.ruby.class_io()) {
+            let fd = unsafe { rb_sys::rb_io_descriptor(self.to_read.as_raw()) };
+            let file = unsafe { File::from_raw_fd(fd) };
+            Ok(Box::new(file))
+        } else {
+            let path = self.to_read.to_r_string()?.to_string()?;
+            let file = std::fs::File::open(&path).map_err(|e| {
+                Error::new(
+                    self.ruby.exception_runtime_error(),
+                    format!("Failed to open file: {e}"),
+                )
+            })?;
+            Ok(Box::new(file))
+        }
+    }
+
+    pub fn build(self) -> Result<RecordReader<T>, Error> {
+        let readable = self.get_reader()?;
+
+        let mut reader = csv::ReaderBuilder::new()
+            .has_headers(self.has_headers)
+            .delimiter(self.delimiter)
+            .quote(self.quote_char)
+            .from_reader(readable);
+
+        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let headers_clone = headers.clone();
+        let null_string = self.null_string;
+
+        let (sender, receiver) = kanal::bounded(self.buffer);
+        let handle = thread::spawn(move || {
+            let mut record = csv::StringRecord::new();
+            while let Ok(true) = reader.read_record(&mut record) {
+                let row = T::parse(&headers_clone, &record, &null_string);
+                if sender.send(row).is_err() {
+                    break;
+                }
+            }
+            let file_to_forget = reader.into_inner();
+            std::mem::forget(file_to_forget);
+        });
+
+        Ok(RecordReader {
+            reader: ReadImpl::MultiThreaded {
+                receiver,
+                handle: Some(handle),
+            },
+        })
+    }
+}
data/ext/osv/src/csv/parser.rs ADDED

@@ -0,0 +1,43 @@
+use std::collections::HashMap;
+
+pub trait RecordParser {
+    type Output;
+
+    fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output;
+}
+
+impl RecordParser for HashMap<String, Option<String>> {
+    type Output = Self;
+
+    fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
+        headers
+            .iter()
+            .zip(record.iter())
+            .map(|(header, field)| {
+                let value = if field == null_string {
+                    None
+                } else {
+                    Some(field.to_string())
+                };
+                (header.clone(), value)
+            })
+            .collect()
+    }
+}
+
+impl RecordParser for Vec<Option<String>> {
+    type Output = Self;
+
+    fn parse(_headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
+        record
+            .iter()
+            .map(|field| {
+                if field == null_string {
+                    None
+                } else {
+                    Some(field.to_string())
+                }
+            })
+            .collect()
+    }
+}
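Both `RecordParser` implementations above compare each field against `null_string` and emit `None` for matches, which Ruby sees as `nil`; given the defaults in the builder and in `parse_csv_args` below, empty fields also come back as `nil` unless you override `nil_string`. A small illustrative sketch (the file contents are hypothetical):

```ruby
# rows.csv (hypothetical contents):
#   id,name
#   1,NULL
#   2,Jane
OSV.for_each("rows.csv", nil_string: "NULL", result_type: :array) do |row|
  p row
end
# => ["1", nil]
# => ["2", "Jane"]
```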
data/ext/osv/src/csv/reader.rs ADDED

@@ -0,0 +1,73 @@
+use super::parser::RecordParser;
+use magnus::{Error, Ruby};
+use std::{io::Read, thread};
+
+pub struct RecordReader<T: RecordParser> {
+    pub(crate) reader: ReadImpl<T>,
+}
+
+#[allow(dead_code)]
+pub enum ReadImpl<T: RecordParser> {
+    SingleThreaded {
+        reader: csv::Reader<Box<dyn Read + Send + 'static>>,
+        headers: Vec<String>,
+        null_string: String,
+    },
+    MultiThreaded {
+        receiver: kanal::Receiver<T::Output>,
+        handle: Option<thread::JoinHandle<()>>,
+    },
+}
+
+impl<T: RecordParser> RecordReader<T> {
+    pub(crate) fn get_headers(
+        ruby: &Ruby,
+        reader: &mut csv::Reader<impl Read>,
+        has_headers: bool,
+    ) -> Result<Vec<String>, Error> {
+        let first_row = reader
+            .headers()
+            .map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to read headers: {e}"),
+                )
+            })?
+            .clone();
+
+        Ok(if has_headers {
+            first_row.iter().map(String::from).collect()
+        } else {
+            (0..first_row.len()).map(|i| format!("c{i}")).collect()
+        })
+    }
+}
+
+impl<T: RecordParser> Iterator for RecordReader<T> {
+    type Item = T::Output;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.reader {
+            ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
+                Ok(record) => Some(record),
+                Err(_) => {
+                    if let Some(handle) = handle.take() {
+                        let _ = handle.join();
+                    }
+                    None
+                }
+            },
+            ReadImpl::SingleThreaded {
+                reader,
+                headers,
+                null_string,
+            } => {
+                let mut record = csv::StringRecord::new();
+                match reader.read_record(&mut record) {
+                    Ok(true) => Some(T::parse(headers, &record, null_string)),
+                    _ => None,
+                }
+            }
+        }
+    }
+}
data/ext/osv/src/csv/record.rs ADDED

@@ -0,0 +1,17 @@
+use magnus::{IntoValue, Ruby, Value};
+use std::collections::HashMap;
+
+#[derive(Debug)]
+pub enum CsvRecord {
+    Vec(Vec<Option<String>>),
+    Map(HashMap<String, Option<String>>),
+}
+
+impl IntoValue for CsvRecord {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self {
+            CsvRecord::Vec(vec) => vec.into_value_with(handle),
+            CsvRecord::Map(map) => map.into_value_with(handle),
+        }
+    }
+}
data/ext/osv/src/lib.rs CHANGED

@@ -1,3 +1,4 @@
+mod csv;
 mod reader;
 mod utils;
 
@@ -10,6 +11,5 @@ use magnus::{Error, Ruby};
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("OSV")?;
     module.define_module_function("for_each", magnus::method!(parse_csv, -1))?;
-    module.define_module_function("for_each_compat", magnus::method!(parse_compat, -1))?;
     Ok(())
 }
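With `for_each_compat` no longer registered, array rows in 0.3.0 go through `for_each` with the `result_type` option instead. A rough migration sketch:

```ruby
# 0.2.0 (method removed in 0.3.0):
# OSV.for_each_compat("data.csv") { |row| p row }  # rows were Arrays of Strings

# 0.3.0:
OSV.for_each("data.csv", result_type: :array) { |row| p row }
```

One behavioural difference worth noting: the new array rows contain `nil` for any field matching `nil_string` (empty fields by default), whereas the 0.2.0 `Vec<String>` parser always returned strings.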
data/ext/osv/src/reader.rs CHANGED

@@ -1,230 +1,100 @@
+use std::collections::HashMap;
+
+use crate::csv::{CsvRecord, RecordReaderBuilder};
 use crate::utils::*;
-use magnus::
-};
-use std::{fs::File, io::Read, os::fd::FromRawFd, thread};
+use magnus::value::ReprValue;
+use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
 
-/// Parses CSV data from a file and yields each row as a hash to the block.
 pub fn parse_csv(
     ruby: &Ruby,
     rb_self: Value,
     args: &[Value],
-) -> Result<Yield<
-        return Ok(Yield::Enumerator(rb_self.enumeratorize("for_each", args)));
-    }
-    let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
-
-    let iter = RecordReader::<std::collections::HashMap<String, String>>::new(
-        ruby,
+) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
+    let CsvArgs {
         to_read,
         has_headers,
-        delimiter
+        delimiter,
+        quote_char,
+        null_string,
+        buffer_size,
+        result_type,
+    } = parse_csv_args(ruby, args)?;
 
-    Ok(Yield::Iter(iter))
-}
-
-pub fn parse_compat(
-    ruby: &Ruby,
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<impl Iterator<Item = Vec<String>>>, Error> {
     if !ruby.block_given() {
-        return
-            rb_self
+        return create_enumerator(EnumeratorArgs {
+            rb_self,
+            to_read,
+            has_headers,
+            delimiter,
+            quote_char,
+            null_string,
+            buffer_size,
+            result_type,
+        });
     }
-    let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
 
-    let iter =
+    let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
+        "hash" => Box::new(
+            RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
+                .has_headers(has_headers)
+                .delimiter(delimiter)
+                .quote_char(quote_char)
+                .null_string(null_string)
+                .buffer(buffer_size)
+                .build()?
+                .map(CsvRecord::Map),
+        ),
+        "array" => Box::new(
+            RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
+                .has_headers(has_headers)
+                .delimiter(delimiter)
+                .quote_char(quote_char)
+                .null_string(null_string)
+                .buffer(buffer_size)
+                .build()?
+                .map(CsvRecord::Vec),
+        ),
+        _ => {
+            return Err(Error::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
+        }
+    };
 
     Ok(Yield::Iter(iter))
 }
 
-    fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output {
-        record
-            .iter()
-            .enumerate()
-            .map(|(i, field)| (headers[i].clone(), field.to_string()))
-            .collect()
-    }
-}
-
-impl RecordParser for Vec<String> {
-    type Output = Self;
-
-    fn parse(_headers: &[String], record: &csv::StringRecord) -> Self::Output {
-        record.iter().map(|field| field.to_string()).collect()
-    }
-}
-
-struct RecordReader<T: RecordParser> {
-    reader: ReadImpl<T>,
-}
-
-#[allow(dead_code)]
-enum ReadImpl<T: RecordParser> {
-    SingleThreaded {
-        reader: csv::Reader<Box<dyn Read + Send + 'static>>,
-        headers: Vec<String>,
-    },
-    MultiThreaded {
-        receiver: kanal::Receiver<T::Output>,
-        handle: Option<thread::JoinHandle<()>>,
-    },
-}
-
-impl<T: RecordParser + Send + 'static> RecordReader<T> {
-    fn new(
-        ruby: &Ruby,
-        to_read: Value,
-        has_headers: bool,
-        delimiter: u8,
-        buffer: usize,
-    ) -> Result<Self, Error> {
-        let string_io = RClass::from(ruby.eval("StringIO").map_err(|e| {
-            Error::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to get StringIO class: {}", e),
-            )
-        })?);
-
-        let readable: Box<dyn Read + Send + 'static> = if to_read.is_kind_of(string_io) {
-            let string: RString = to_read.funcall("string", ()).map_err(|e| {
-                Error::new(
-                    ruby.exception_runtime_error(),
-                    format!("Failed to get string from StringIO: {}", e),
-                )
-            })?;
-            let content = string.to_string().map_err(|e| {
-                Error::new(
-                    ruby.exception_runtime_error(),
-                    format!("Failed to convert string to Rust String: {}", e),
-                )
-            })?;
-            Box::new(std::io::Cursor::new(content))
-        } else if to_read.is_kind_of(ruby.class_io()) {
-            let fd = unsafe { rb_sys::rb_io_descriptor(to_read.as_raw()) };
-            let file = unsafe { File::from_raw_fd(fd) };
-            Box::new(file)
-        } else {
-            let path = to_read
-                .to_r_string()
-                .map_err(|e| {
-                    Error::new(
-                        ruby.exception_runtime_error(),
-                        format!("Failed to convert path to string: {}", e),
-                    )
-                })?
-                .to_string()
-                .map_err(|e| {
-                    Error::new(
-                        ruby.exception_runtime_error(),
-                        format!("Failed to convert RString to Rust String: {}", e),
-                    )
-                })?;
-            let file = std::fs::File::open(&path).map_err(|e| {
-                Error::new(
-                    ruby.exception_runtime_error(),
-                    format!("Failed to open file: {}", e),
-                )
-            })?;
-            Box::new(file)
-        };
-
-        let mut reader = csv::ReaderBuilder::new()
-            .has_headers(has_headers)
-            .delimiter(delimiter)
-            .from_reader(readable);
-
-        let headers = Self::get_headers(&mut reader, has_headers)?;
-        let headers_clone = headers.clone();
-
-        let (sender, receiver) = kanal::bounded(buffer);
-        let handle = thread::spawn(move || {
-            let mut record = csv::StringRecord::new();
-            while let Ok(read) = reader.read_record(&mut record) {
-                if !read {
-                    let file_to_forget = reader.into_inner();
-                    std::mem::forget(file_to_forget);
-                    break;
-                }
-                let row = T::parse(&headers_clone, &record);
-                if sender.send(row).is_err() {
-                    break;
-                }
-            }
-        });
-
-        let read_impl = ReadImpl::MultiThreaded {
-            receiver,
-            handle: Some(handle),
-        };
-
-        Ok(Self { reader: read_impl })
-    }
-
-    fn get_headers(
-        reader: &mut csv::Reader<impl Read>,
-        has_headers: bool,
-    ) -> Result<Vec<String>, Error> {
-        let first_row = reader
-            .headers()
-            .map_err(|e| {
-                Error::new(
-                    magnus::exception::runtime_error(),
-                    format!("Failed to read headers: {}", e),
-                )
-            })?
-            .clone();
-        let num_fields = first_row.len();
-
-        Ok(if has_headers {
-            first_row.iter().map(|h| h.to_string()).collect()
-        } else {
-            (0..num_fields).map(|i| format!("c{}", i)).collect()
-        })
-    }
+struct EnumeratorArgs {
+    rb_self: Value,
+    to_read: Value,
+    has_headers: bool,
+    delimiter: u8,
+    quote_char: u8,
+    null_string: String,
+    buffer_size: usize,
+    result_type: String,
 }
 
-    }
-}
+fn create_enumerator(
+    args: EnumeratorArgs,
+) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
+    let kwargs = RHash::new();
+    kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
+    kwargs.aset(
+        Symbol::new("col_sep"),
+        String::from_utf8(vec![args.delimiter]).unwrap(),
+    )?;
+    kwargs.aset(
+        Symbol::new("quote_char"),
+        String::from_utf8(vec![args.quote_char]).unwrap(),
+    )?;
+    kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
+    kwargs.aset(Symbol::new("buffer_size"), args.buffer_size)?;
+    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+
+    let enumerator = args
+        .rb_self
+        .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
+    Ok(Yield::Enumerator(enumerator))
 }
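When no block is given, the rewritten `parse_csv` packs every parsed option back into a kwargs hash and calls `enumeratorize("for_each", ...)`, so a lazily built Enumerator re-applies the same settings once it is iterated. A small usage sketch:

```ruby
rows = OSV.for_each("data.csv", col_sep: ";", result_type: :array) # no block: returns an Enumerator
rows.each { |row| p row } # the col_sep/result_type above are re-applied when iteration starts
```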
data/ext/osv/src/utils.rs CHANGED

@@ -1,20 +1,123 @@
 use magnus::{
     scan_args::{get_kwargs, scan_args},
+    value::ReprValue,
+    Error, RString, Ruby, Symbol, Value,
 };
 
+#[derive(Debug)]
+pub struct CsvArgs {
+    pub to_read: Value,
+    pub has_headers: bool,
+    pub delimiter: u8,
+    pub quote_char: u8,
+    pub null_string: String,
+    pub buffer_size: usize,
+    pub result_type: String,
+}
+
 /// Parse common arguments for CSV parsing
-pub fn parse_csv_args(args: &[Value]) -> Result<
+pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<bool>,
+            Option<String>,
+            Option<String>,
+            Option<String>,
+            Option<usize>,
+            Option<Value>,
+        ),
+        (),
+    >(
        parsed_args.keywords,
        &[],
-        &[
+        &[
+            "has_headers",
+            "col_sep",
+            "quote_char",
+            "nil_string",
+            "buffer_size",
+            "result_type",
+        ],
     )?;
 
     let has_headers = kwargs.optional.0.unwrap_or(true);
 
+    let delimiter = *kwargs
+        .optional
+        .1
+        .unwrap_or_else(|| ",".to_string())
+        .as_bytes()
+        .first()
+        .ok_or_else(|| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                "Delimiter cannot be empty",
+            )
+        })?;
+
+    let quote_char = *kwargs
+        .optional
+        .2
+        .unwrap_or_else(|| "\"".to_string())
+        .as_bytes()
+        .first()
+        .ok_or_else(|| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                "Quote character cannot be empty",
+            )
+        })?;
+
+    let null_string = kwargs.optional.3.unwrap_or_else(|| "".to_string());
+
+    let buffer_size = kwargs.optional.4.unwrap_or(1000);
+
+    let result_type = match kwargs.optional.5 {
+        Some(value) => {
+            let parsed = if value.is_kind_of(ruby.class_string()) {
+                RString::from_value(value)
+                    .ok_or_else(|| {
+                        Error::new(magnus::exception::type_error(), "Invalid string value")
+                    })?
+                    .to_string()?
+            } else if value.is_kind_of(ruby.class_symbol()) {
+                Symbol::from_value(value)
+                    .ok_or_else(|| {
+                        Error::new(magnus::exception::type_error(), "Invalid symbol value")
+                    })?
+                    .funcall("to_s", ())?
+            } else {
+                return Err(Error::new(
+                    magnus::exception::type_error(),
+                    "result_type must be a String or Symbol",
+                ));
+            };
+
+            match parsed.as_str() {
+                "hash" | "array" => parsed,
+                _ => {
+                    return Err(Error::new(
+                        magnus::exception::runtime_error(),
+                        "result_type must be either 'hash' or 'array'",
+                    ))
+                }
+            }
+        }
+        None => String::from("hash"),
+    };
+
+    Ok(CsvArgs {
+        to_read,
+        has_headers,
+        delimiter,
+        quote_char,
+        null_string,
+        buffer_size,
+        result_type,
+    })
 }
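`parse_csv_args` accepts `result_type` as either a String or a Symbol and rejects everything else, so the two spellings below are equivalent (sketch):

```ruby
OSV.for_each("data.csv", result_type: "array") { |row| p row }
OSV.for_each("data.csv", result_type: :array) { |row| p row }

# Any other value raises:
#   OSV.for_each("data.csv", result_type: :json) { |row| p row }
#   # => RuntimeError: result_type must be either 'hash' or 'array'
```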
data/lib/osv/version.rb CHANGED
data/lib/osv.rbi CHANGED

@@ -2,28 +2,26 @@
 
 module OSV
   sig do
+    params(
+      input: T.any(String, StringIO, IO),
+      has_headers: T.nilable(T::Boolean),
+      col_sep: T.nilable(String),
+      quote_char: T.nilable(String),
+      nil_string: T.nilable(String),
+      buffer_size: T.nilable(Integer),
+      result_type: T.nilable(String),
+      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
+    ).returns(T.any(Enumerator, T.untyped))
   end
-  def self.for_each(
-  )
-    .returns(T.untyped)
-  end
-  def self.for_each_compat(input, has_headers: true, delimiter: nil, &blk)
+  def self.for_each(
+    input,
+    has_headers: true,
+    col_sep: nil,
+    quote_char: nil,
+    nil_string: nil,
+    buffer_size: nil,
+    result_type: nil,
+    &blk
+  )
   end
 end
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Nathan Jaremko
@@ -49,10 +49,17 @@ files:
 - Cargo.lock
 - Cargo.toml
 - Gemfile
+- LICENSE
+- README.md
 - Rakefile
 - ext/osv/Cargo.lock
 - ext/osv/Cargo.toml
 - ext/osv/extconf.rb
+- ext/osv/src/csv/builder.rs
+- ext/osv/src/csv/mod.rs
+- ext/osv/src/csv/parser.rs
+- ext/osv/src/csv/reader.rs
+- ext/osv/src/csv/record.rs
 - ext/osv/src/lib.rs
 - ext/osv/src/reader.rs
 - ext/osv/src/utils.rs