osv 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1b0c347b0bab5c9d31069c56f47999bfa51e85dfc1e127d1c4474a84ac19c53
4
- data.tar.gz: 847b199da27b7c1329c1fa64fc8636592f004e93a12fc2ddd8db6127298ac23d
3
+ metadata.gz: f22d1d56b0eba1e23ca192db2c70e68689486e2f0032672285017e1f98a530d2
4
+ data.tar.gz: 6288dce70b95faf312e8aa244ba56a60c3d59b85bdd16a4a951060df78b97e1e
5
5
  SHA512:
6
- metadata.gz: 62fa77c1ca98031f483569a4dba7cf9e4eca52a4b5fae293d274d5f89c48003e301eab01d95116cfc9cc6a2642e742d16046231a21d25e4a5143bd6ec3b40dac
7
- data.tar.gz: 3832cbb6ebadfc718a8a5d1963de960ed3abf09d4559d2f0ffe031c642a6c2581dc6ad6edf5d65f22248812585ca464916375503753831024fc355fe4cd04455
6
+ metadata.gz: 5399b43ecd3987c73daf09341d51a1ee8e5d060f1085e9a7aac9b823ab723ccbbc4084c5c0c9abbd36e7e17becfcfa1757af6af3666e01ef3992a27e35b5b983
7
+ data.tar.gz: 2a6d98b645af40ab08a5a01a2bcf5b67ce9ebff18a993e602f6b00fa2f3f80d65c63605a7f5a715286ac20751db0607c515686e4439156a4039ae24f49e95e10
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Nathan Jaremko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # OSV
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv)
4
+
5
+ OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [rust-csv](https://github.com/BurntSushi/rust-csv) crate.
6
+
7
+ It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
8
+
9
+ The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'osv'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ ```bash
22
+ bundle install
23
+ ```
24
+
25
+ Or install it directly:
26
+
27
+ ```bash
28
+ gem install osv
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ### Basic Usage with Hash Output
34
+
35
+ Each row is returned as a hash where the keys are the column headers:
36
+
37
+ ```ruby
38
+ require 'osv'
39
+
40
+ # Read from a file
41
+ OSV.for_each("path/to/file.csv") do |row|
42
+ # row is a Hash like {"name" => "John", "age" => "25"}
43
+ puts row["name"]
44
+ end
45
+
46
+ # Without a block, returns an Enumerator
47
+ rows = OSV.for_each("path/to/file.csv")
48
+ rows.each { |row| puts row["name"] }
49
+ ```
50
+
51
+ ### Array Output Mode
52
+
53
+ If you prefer working with arrays instead of hashes, pass `result_type: :array` to `for_each`:
54
+
55
+ ```ruby
56
+ OSV.for_each("path/to/file.csv", result_type: :array) do |row|
57
+ # row is an Array like ["John", "25"]
58
+ puts row[0]
59
+ end
60
+ ```
61
+
62
+ ### Options
63
+
64
+ Both output modes support the following options:
65
+
66
+ - `has_headers`: Boolean indicating if the first row contains headers (default: true)
67
+ - `col_sep`: String specifying the field separator (default: ",")
68
+
69
+ ```ruby
70
+ # Reading TSV files
71
+ OSV.for_each("path/to/file.tsv", col_sep: "\t") do |row|
72
+ puts row["name"]
73
+ end
74
+
75
+ # Reading without headers
76
+ OSV.for_each("path/to/file.csv", has_headers: false) do |row|
77
+ # Headers will be automatically generated as "c0", "c1", etc.
78
+ puts row["c0"]
79
+ end
80
+ ```
81
+
82
+ ### Input Sources
83
+
84
+ OSV supports reading from:
85
+
86
+ - File paths (as strings)
87
+ - IO objects
88
+ - Important caveat: the IO object must be backed by a real file descriptor (OSV obtains it via `rb_io_descriptor`).
89
+ - StringIO objects
90
+ - Note: when reading from a StringIO, its string is copied (in full) into a Rust string and parsed from there.
91
+
92
+ ```ruby
93
+ # From file path
94
+ OSV.for_each("path/to/file.csv") { |row| puts row["name"] }
95
+
96
+ # From IO object
97
+ File.open("path/to/file.csv") do |file|
98
+ OSV.for_each(file) { |row| puts row["name"] }
99
+ end
100
+
101
+ # From StringIO
102
+ data = StringIO.new("name,age\nJohn,25")
103
+ OSV.for_each(data) { |row| puts row["name"] }
104
+ ```
105
+
106
+ ## Requirements
107
+
108
+ - Ruby >= 3.1.0
109
+ - Rust toolchain (for installation from source)
110
+
111
+ ## Performance
112
+
113
+ This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
@@ -0,0 +1,114 @@
1
+ use super::{
2
+ parser::RecordParser,
3
+ reader::{ReadImpl, RecordReader},
4
+ };
5
+ use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
6
+ use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
7
+
8
+ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
9
+ ruby: &'a Ruby,
10
+ to_read: Value,
11
+ has_headers: bool,
12
+ delimiter: u8,
13
+ quote_char: u8,
14
+ null_string: String,
15
+ buffer: usize,
16
+ _phantom: PhantomData<T>,
17
+ }
18
+
19
+ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
20
+ pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
21
+ Self {
22
+ ruby,
23
+ to_read,
24
+ has_headers: true,
25
+ delimiter: b',',
26
+ quote_char: b'"',
27
+ null_string: String::new(),
28
+ buffer: 1000,
29
+ _phantom: PhantomData,
30
+ }
31
+ }
32
+
33
+ pub fn has_headers(mut self, has_headers: bool) -> Self {
34
+ self.has_headers = has_headers;
35
+ self
36
+ }
37
+
38
+ pub fn delimiter(mut self, delimiter: u8) -> Self {
39
+ self.delimiter = delimiter;
40
+ self
41
+ }
42
+
43
+ pub fn quote_char(mut self, quote_char: u8) -> Self {
44
+ self.quote_char = quote_char;
45
+ self
46
+ }
47
+
48
+ pub fn null_string(mut self, null_string: String) -> Self {
49
+ self.null_string = null_string;
50
+ self
51
+ }
52
+
53
+ pub fn buffer(mut self, buffer: usize) -> Self {
54
+ self.buffer = buffer;
55
+ self
56
+ }
57
+
58
+ fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, Error> {
59
+ let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
60
+
61
+ if self.to_read.is_kind_of(string_io) {
62
+ let string: RString = self.to_read.funcall("string", ())?;
63
+ let content = string.to_string()?;
64
+ Ok(Box::new(std::io::Cursor::new(content)))
65
+ } else if self.to_read.is_kind_of(self.ruby.class_io()) {
66
+ let fd = unsafe { rb_sys::rb_io_descriptor(self.to_read.as_raw()) };
67
+ let file = unsafe { File::from_raw_fd(fd) };
68
+ Ok(Box::new(file))
69
+ } else {
70
+ let path = self.to_read.to_r_string()?.to_string()?;
71
+ let file = std::fs::File::open(&path).map_err(|e| {
72
+ Error::new(
73
+ self.ruby.exception_runtime_error(),
74
+ format!("Failed to open file: {e}"),
75
+ )
76
+ })?;
77
+ Ok(Box::new(file))
78
+ }
79
+ }
80
+
81
+ pub fn build(self) -> Result<RecordReader<T>, Error> {
82
+ let readable = self.get_reader()?;
83
+
84
+ let mut reader = csv::ReaderBuilder::new()
85
+ .has_headers(self.has_headers)
86
+ .delimiter(self.delimiter)
87
+ .quote(self.quote_char)
88
+ .from_reader(readable);
89
+
90
+ let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
91
+ let headers_clone = headers.clone();
92
+ let null_string = self.null_string;
93
+
94
+ let (sender, receiver) = kanal::bounded(self.buffer);
95
+ let handle = thread::spawn(move || {
96
+ let mut record = csv::StringRecord::new();
97
+ while let Ok(true) = reader.read_record(&mut record) {
98
+ let row = T::parse(&headers_clone, &record, &null_string);
99
+ if sender.send(row).is_err() {
100
+ break;
101
+ }
102
+ }
103
+ let file_to_forget = reader.into_inner();
104
+ std::mem::forget(file_to_forget);
105
+ });
106
+
107
+ Ok(RecordReader {
108
+ reader: ReadImpl::MultiThreaded {
109
+ receiver,
110
+ handle: Some(handle),
111
+ },
112
+ })
113
+ }
114
+ }
@@ -0,0 +1,8 @@
1
+ mod builder;
2
+ mod parser;
3
+ mod reader;
4
+ mod record;
5
+
6
+ pub use builder::RecordReaderBuilder;
7
+ pub use record::CsvRecord;
8
+
@@ -0,0 +1,43 @@
1
+ use std::collections::HashMap;
2
+
3
+ pub trait RecordParser {
4
+ type Output;
5
+
6
+ fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output;
7
+ }
8
+
9
+ impl RecordParser for HashMap<String, Option<String>> {
10
+ type Output = Self;
11
+
12
+ fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
13
+ headers
14
+ .iter()
15
+ .zip(record.iter())
16
+ .map(|(header, field)| {
17
+ let value = if field == null_string {
18
+ None
19
+ } else {
20
+ Some(field.to_string())
21
+ };
22
+ (header.clone(), value)
23
+ })
24
+ .collect()
25
+ }
26
+ }
27
+
28
+ impl RecordParser for Vec<Option<String>> {
29
+ type Output = Self;
30
+
31
+ fn parse(_headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
32
+ record
33
+ .iter()
34
+ .map(|field| {
35
+ if field == null_string {
36
+ None
37
+ } else {
38
+ Some(field.to_string())
39
+ }
40
+ })
41
+ .collect()
42
+ }
43
+ }
@@ -0,0 +1,73 @@
1
+ use super::parser::RecordParser;
2
+ use magnus::{Error, Ruby};
3
+ use std::{io::Read, thread};
4
+
5
+ pub struct RecordReader<T: RecordParser> {
6
+ pub(crate) reader: ReadImpl<T>,
7
+ }
8
+
9
+ #[allow(dead_code)]
10
+ pub enum ReadImpl<T: RecordParser> {
11
+ SingleThreaded {
12
+ reader: csv::Reader<Box<dyn Read + Send + 'static>>,
13
+ headers: Vec<String>,
14
+ null_string: String,
15
+ },
16
+ MultiThreaded {
17
+ receiver: kanal::Receiver<T::Output>,
18
+ handle: Option<thread::JoinHandle<()>>,
19
+ },
20
+ }
21
+
22
+ impl<T: RecordParser> RecordReader<T> {
23
+ pub(crate) fn get_headers(
24
+ ruby: &Ruby,
25
+ reader: &mut csv::Reader<impl Read>,
26
+ has_headers: bool,
27
+ ) -> Result<Vec<String>, Error> {
28
+ let first_row = reader
29
+ .headers()
30
+ .map_err(|e| {
31
+ Error::new(
32
+ ruby.exception_runtime_error(),
33
+ format!("Failed to read headers: {e}"),
34
+ )
35
+ })?
36
+ .clone();
37
+
38
+ Ok(if has_headers {
39
+ first_row.iter().map(String::from).collect()
40
+ } else {
41
+ (0..first_row.len()).map(|i| format!("c{i}")).collect()
42
+ })
43
+ }
44
+ }
45
+
46
+ impl<T: RecordParser> Iterator for RecordReader<T> {
47
+ type Item = T::Output;
48
+
49
+ fn next(&mut self) -> Option<Self::Item> {
50
+ match &mut self.reader {
51
+ ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
52
+ Ok(record) => Some(record),
53
+ Err(_) => {
54
+ if let Some(handle) = handle.take() {
55
+ let _ = handle.join();
56
+ }
57
+ None
58
+ }
59
+ },
60
+ ReadImpl::SingleThreaded {
61
+ reader,
62
+ headers,
63
+ null_string,
64
+ } => {
65
+ let mut record = csv::StringRecord::new();
66
+ match reader.read_record(&mut record) {
67
+ Ok(true) => Some(T::parse(headers, &record, null_string)),
68
+ _ => None,
69
+ }
70
+ }
71
+ }
72
+ }
73
+ }
@@ -0,0 +1,17 @@
1
+ use magnus::{IntoValue, Ruby, Value};
2
+ use std::collections::HashMap;
3
+
4
+ #[derive(Debug)]
5
+ pub enum CsvRecord {
6
+ Vec(Vec<Option<String>>),
7
+ Map(HashMap<String, Option<String>>),
8
+ }
9
+
10
+ impl IntoValue for CsvRecord {
11
+ fn into_value_with(self, handle: &Ruby) -> Value {
12
+ match self {
13
+ CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
+ CsvRecord::Map(map) => map.into_value_with(handle),
15
+ }
16
+ }
17
+ }
data/ext/osv/src/lib.rs CHANGED
@@ -1,3 +1,4 @@
1
+ mod csv;
1
2
  mod reader;
2
3
  mod utils;
3
4
 
@@ -10,6 +11,5 @@ use magnus::{Error, Ruby};
10
11
  fn init(ruby: &Ruby) -> Result<(), Error> {
11
12
  let module = ruby.define_module("OSV")?;
12
13
  module.define_module_function("for_each", magnus::method!(parse_csv, -1))?;
13
- module.define_module_function("for_each_compat", magnus::method!(parse_compat, -1))?;
14
14
  Ok(())
15
15
  }
@@ -1,230 +1,100 @@
1
+ use std::collections::HashMap;
2
+
3
+ use crate::csv::{CsvRecord, RecordReaderBuilder};
1
4
  use crate::utils::*;
2
- use magnus::{
3
- block::Yield, rb_sys::AsRawValue, value::ReprValue, Error, RClass, RString, Ruby, Value,
4
- };
5
- use std::{fs::File, io::Read, os::fd::FromRawFd, thread};
5
+ use magnus::value::ReprValue;
6
+ use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
6
7
 
7
- /// Parses CSV data from a file and yields each row as a hash to the block.
8
8
  pub fn parse_csv(
9
9
  ruby: &Ruby,
10
10
  rb_self: Value,
11
11
  args: &[Value],
12
- ) -> Result<Yield<impl Iterator<Item = std::collections::HashMap<String, String>>>, Error> {
13
- if !ruby.block_given() {
14
- return Ok(Yield::Enumerator(rb_self.enumeratorize("for_each", args)));
15
- }
16
- let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
17
-
18
- let iter = RecordReader::<std::collections::HashMap<String, String>>::new(
19
- ruby,
12
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
13
+ let CsvArgs {
20
14
  to_read,
21
15
  has_headers,
22
- delimiter.unwrap_or_else(|| ",".to_string()).as_bytes()[0],
23
- 1000,
24
- )?;
16
+ delimiter,
17
+ quote_char,
18
+ null_string,
19
+ buffer_size,
20
+ result_type,
21
+ } = parse_csv_args(ruby, args)?;
25
22
 
26
- Ok(Yield::Iter(iter))
27
- }
28
-
29
- pub fn parse_compat(
30
- ruby: &Ruby,
31
- rb_self: Value,
32
- args: &[Value],
33
- ) -> Result<Yield<impl Iterator<Item = Vec<String>>>, Error> {
34
23
  if !ruby.block_given() {
35
- return Ok(Yield::Enumerator(
36
- rb_self.enumeratorize("for_each_compat", args),
37
- ));
24
+ return create_enumerator(EnumeratorArgs {
25
+ rb_self,
26
+ to_read,
27
+ has_headers,
28
+ delimiter,
29
+ quote_char,
30
+ null_string,
31
+ buffer_size,
32
+ result_type,
33
+ });
38
34
  }
39
- let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
40
35
 
41
- let iter = RecordReader::<Vec<String>>::new(
42
- ruby,
43
- to_read,
44
- has_headers,
45
- delimiter.unwrap_or_else(|| ",".to_string()).as_bytes()[0],
46
- 1000,
47
- )?;
36
+ let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
37
+ "hash" => Box::new(
38
+ RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
39
+ .has_headers(has_headers)
40
+ .delimiter(delimiter)
41
+ .quote_char(quote_char)
42
+ .null_string(null_string)
43
+ .buffer(buffer_size)
44
+ .build()?
45
+ .map(CsvRecord::Map),
46
+ ),
47
+ "array" => Box::new(
48
+ RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
49
+ .has_headers(has_headers)
50
+ .delimiter(delimiter)
51
+ .quote_char(quote_char)
52
+ .null_string(null_string)
53
+ .buffer(buffer_size)
54
+ .build()?
55
+ .map(CsvRecord::Vec),
56
+ ),
57
+ _ => {
58
+ return Err(Error::new(
59
+ ruby.exception_runtime_error(),
60
+ "Invalid result type",
61
+ ))
62
+ }
63
+ };
48
64
 
49
65
  Ok(Yield::Iter(iter))
50
66
  }
51
67
 
52
- pub trait RecordParser {
53
- type Output;
54
-
55
- fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output;
56
- }
57
-
58
- impl RecordParser for std::collections::HashMap<String, String> {
59
- type Output = Self;
60
-
61
- fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output {
62
- record
63
- .iter()
64
- .enumerate()
65
- .map(|(i, field)| (headers[i].clone(), field.to_string()))
66
- .collect()
67
- }
68
- }
69
-
70
- impl RecordParser for Vec<String> {
71
- type Output = Self;
72
-
73
- fn parse(_headers: &[String], record: &csv::StringRecord) -> Self::Output {
74
- record.iter().map(|field| field.to_string()).collect()
75
- }
76
- }
77
-
78
- struct RecordReader<T: RecordParser> {
79
- reader: ReadImpl<T>,
80
- }
81
-
82
- #[allow(dead_code)]
83
- enum ReadImpl<T: RecordParser> {
84
- SingleThreaded {
85
- reader: csv::Reader<Box<dyn Read + Send + 'static>>,
86
- headers: Vec<String>,
87
- },
88
- MultiThreaded {
89
- receiver: kanal::Receiver<T::Output>,
90
- handle: Option<thread::JoinHandle<()>>,
91
- },
92
- }
93
-
94
- impl<T: RecordParser + Send + 'static> RecordReader<T> {
95
- fn new(
96
- ruby: &Ruby,
97
- to_read: Value,
98
- has_headers: bool,
99
- delimiter: u8,
100
- buffer: usize,
101
- ) -> Result<Self, Error> {
102
- let string_io = RClass::from(ruby.eval("StringIO").map_err(|e| {
103
- Error::new(
104
- ruby.exception_runtime_error(),
105
- format!("Failed to get StringIO class: {}", e),
106
- )
107
- })?);
108
-
109
- let readable: Box<dyn Read + Send + 'static> = if to_read.is_kind_of(string_io) {
110
- let string: RString = to_read.funcall("string", ()).map_err(|e| {
111
- Error::new(
112
- ruby.exception_runtime_error(),
113
- format!("Failed to get string from StringIO: {}", e),
114
- )
115
- })?;
116
- let content = string.to_string().map_err(|e| {
117
- Error::new(
118
- ruby.exception_runtime_error(),
119
- format!("Failed to convert string to Rust String: {}", e),
120
- )
121
- })?;
122
- Box::new(std::io::Cursor::new(content))
123
- } else if to_read.is_kind_of(ruby.class_io()) {
124
- let fd = unsafe { rb_sys::rb_io_descriptor(to_read.as_raw()) };
125
- let file = unsafe { File::from_raw_fd(fd) };
126
- Box::new(file)
127
- } else {
128
- let path = to_read
129
- .to_r_string()
130
- .map_err(|e| {
131
- Error::new(
132
- ruby.exception_runtime_error(),
133
- format!("Failed to convert path to string: {}", e),
134
- )
135
- })?
136
- .to_string()
137
- .map_err(|e| {
138
- Error::new(
139
- ruby.exception_runtime_error(),
140
- format!("Failed to convert RString to Rust String: {}", e),
141
- )
142
- })?;
143
- let file = std::fs::File::open(&path).map_err(|e| {
144
- Error::new(
145
- ruby.exception_runtime_error(),
146
- format!("Failed to open file: {}", e),
147
- )
148
- })?;
149
- Box::new(file)
150
- };
151
-
152
- let mut reader = csv::ReaderBuilder::new()
153
- .has_headers(has_headers)
154
- .delimiter(delimiter)
155
- .from_reader(readable);
156
-
157
- let headers = Self::get_headers(&mut reader, has_headers)?;
158
- let headers_clone = headers.clone();
159
-
160
- let (sender, receiver) = kanal::bounded(buffer);
161
- let handle = thread::spawn(move || {
162
- let mut record = csv::StringRecord::new();
163
- while let Ok(read) = reader.read_record(&mut record) {
164
- if !read {
165
- let file_to_forget = reader.into_inner();
166
- std::mem::forget(file_to_forget);
167
- break;
168
- }
169
- let row = T::parse(&headers_clone, &record);
170
- if sender.send(row).is_err() {
171
- break;
172
- }
173
- }
174
- });
175
-
176
- let read_impl = ReadImpl::MultiThreaded {
177
- receiver,
178
- handle: Some(handle),
179
- };
180
-
181
- Ok(Self { reader: read_impl })
182
- }
183
-
184
- fn get_headers(
185
- reader: &mut csv::Reader<impl Read>,
186
- has_headers: bool,
187
- ) -> Result<Vec<String>, Error> {
188
- let first_row = reader
189
- .headers()
190
- .map_err(|e| {
191
- Error::new(
192
- magnus::exception::runtime_error(),
193
- format!("Failed to read headers: {}", e),
194
- )
195
- })?
196
- .clone();
197
- let num_fields = first_row.len();
198
-
199
- Ok(if has_headers {
200
- first_row.iter().map(|h| h.to_string()).collect()
201
- } else {
202
- (0..num_fields).map(|i| format!("c{}", i)).collect()
203
- })
204
- }
68
+ struct EnumeratorArgs {
69
+ rb_self: Value,
70
+ to_read: Value,
71
+ has_headers: bool,
72
+ delimiter: u8,
73
+ quote_char: u8,
74
+ null_string: String,
75
+ buffer_size: usize,
76
+ result_type: String,
205
77
  }
206
78
 
207
- impl<T: RecordParser> Iterator for RecordReader<T> {
208
- type Item = T::Output;
209
-
210
- fn next(&mut self) -> Option<Self::Item> {
211
- match &mut self.reader {
212
- ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
213
- Ok(record) => Some(record),
214
- Err(_) => {
215
- if let Some(handle) = handle.take() {
216
- let _ = handle.join();
217
- }
218
- None
219
- }
220
- },
221
- ReadImpl::SingleThreaded { reader, headers } => {
222
- let mut record = csv::StringRecord::new();
223
- match reader.read_record(&mut record) {
224
- Ok(true) => Some(T::parse(headers, &record)),
225
- _ => None,
226
- }
227
- }
228
- }
229
- }
79
+ fn create_enumerator(
80
+ args: EnumeratorArgs,
81
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
82
+ let kwargs = RHash::new();
83
+ kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
84
+ kwargs.aset(
85
+ Symbol::new("col_sep"),
86
+ String::from_utf8(vec![args.delimiter]).unwrap(),
87
+ )?;
88
+ kwargs.aset(
89
+ Symbol::new("quote_char"),
90
+ String::from_utf8(vec![args.quote_char]).unwrap(),
91
+ )?;
92
+ kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
93
+ kwargs.aset(Symbol::new("buffer_size"), args.buffer_size)?;
94
+ kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
95
+
96
+ let enumerator = args
97
+ .rb_self
98
+ .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
99
+ Ok(Yield::Enumerator(enumerator))
230
100
  }
data/ext/osv/src/utils.rs CHANGED
@@ -1,20 +1,123 @@
1
1
  use magnus::{
2
2
  scan_args::{get_kwargs, scan_args},
3
- Error, Value,
3
+ value::ReprValue,
4
+ Error, RString, Ruby, Symbol, Value,
4
5
  };
5
6
 
7
+ #[derive(Debug)]
8
+ pub struct CsvArgs {
9
+ pub to_read: Value,
10
+ pub has_headers: bool,
11
+ pub delimiter: u8,
12
+ pub quote_char: u8,
13
+ pub null_string: String,
14
+ pub buffer_size: usize,
15
+ pub result_type: String,
16
+ }
17
+
6
18
  /// Parse common arguments for CSV parsing
7
- pub fn parse_csv_args(args: &[Value]) -> Result<(Value, bool, Option<String>), Error> {
19
+ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
8
20
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
9
21
  let (to_read,) = parsed_args.required;
10
22
 
11
- let kwargs = get_kwargs::<_, (), (Option<bool>, Option<String>), ()>(
23
+ let kwargs = get_kwargs::<
24
+ _,
25
+ (),
26
+ (
27
+ Option<bool>,
28
+ Option<String>,
29
+ Option<String>,
30
+ Option<String>,
31
+ Option<usize>,
32
+ Option<Value>,
33
+ ),
34
+ (),
35
+ >(
12
36
  parsed_args.keywords,
13
37
  &[],
14
- &["has_headers", "delimiter"],
38
+ &[
39
+ "has_headers",
40
+ "col_sep",
41
+ "quote_char",
42
+ "nil_string",
43
+ "buffer_size",
44
+ "result_type",
45
+ ],
15
46
  )?;
16
47
 
17
48
  let has_headers = kwargs.optional.0.unwrap_or(true);
18
49
 
19
- Ok((to_read, has_headers, kwargs.optional.1))
50
+ let delimiter = *kwargs
51
+ .optional
52
+ .1
53
+ .unwrap_or_else(|| ",".to_string())
54
+ .as_bytes()
55
+ .first()
56
+ .ok_or_else(|| {
57
+ Error::new(
58
+ magnus::exception::runtime_error(),
59
+ "Delimiter cannot be empty",
60
+ )
61
+ })?;
62
+
63
+ let quote_char = *kwargs
64
+ .optional
65
+ .2
66
+ .unwrap_or_else(|| "\"".to_string())
67
+ .as_bytes()
68
+ .first()
69
+ .ok_or_else(|| {
70
+ Error::new(
71
+ magnus::exception::runtime_error(),
72
+ "Quote character cannot be empty",
73
+ )
74
+ })?;
75
+
76
+ let null_string = kwargs.optional.3.unwrap_or_else(|| "".to_string());
77
+
78
+ let buffer_size = kwargs.optional.4.unwrap_or(1000);
79
+
80
+ let result_type = match kwargs.optional.5 {
81
+ Some(value) => {
82
+ let parsed = if value.is_kind_of(ruby.class_string()) {
83
+ RString::from_value(value)
84
+ .ok_or_else(|| {
85
+ Error::new(magnus::exception::type_error(), "Invalid string value")
86
+ })?
87
+ .to_string()?
88
+ } else if value.is_kind_of(ruby.class_symbol()) {
89
+ Symbol::from_value(value)
90
+ .ok_or_else(|| {
91
+ Error::new(magnus::exception::type_error(), "Invalid symbol value")
92
+ })?
93
+ .funcall("to_s", ())?
94
+ } else {
95
+ return Err(Error::new(
96
+ magnus::exception::type_error(),
97
+ "result_type must be a String or Symbol",
98
+ ));
99
+ };
100
+
101
+ match parsed.as_str() {
102
+ "hash" | "array" => parsed,
103
+ _ => {
104
+ return Err(Error::new(
105
+ magnus::exception::runtime_error(),
106
+ "result_type must be either 'hash' or 'array'",
107
+ ))
108
+ }
109
+ }
110
+ }
111
+ None => String::from("hash"),
112
+ };
113
+
114
+ Ok(CsvArgs {
115
+ to_read,
116
+ has_headers,
117
+ delimiter,
118
+ quote_char,
119
+ null_string,
120
+ buffer_size,
121
+ result_type,
122
+ })
20
123
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/osv.rbi CHANGED
@@ -2,28 +2,26 @@
2
2
 
3
3
  module OSV
4
4
  sig do
5
- type_parameters(:T)
6
- .params(
7
- input: T.any(String, StringIO, IO),
8
- has_headers: T.nilable(T::Boolean),
9
- delimiter: T.nilable(String),
10
- blk: T.proc.params(row: T::Hash[String, String]).void
11
- )
12
- .returns(T.untyped)
5
+ params(
6
+ input: T.any(String, StringIO, IO),
7
+ has_headers: T.nilable(T::Boolean),
8
+ col_sep: T.nilable(String),
9
+ quote_char: T.nilable(String),
10
+ nil_string: T.nilable(String),
11
+ buffer_size: T.nilable(Integer),
12
+ result_type: T.nilable(String),
13
+ blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
14
+ ).returns(T.any(Enumerator, T.untyped))
13
15
  end
14
- def self.for_each(input, has_headers: true, delimiter: nil, &blk)
15
- end
16
-
17
- sig do
18
- type_parameters(:T)
19
- .params(
20
- input: T.any(String, StringIO, IO),
21
- has_headers: T.nilable(T::Boolean),
22
- delimiter: T.nilable(String),
23
- blk: T.proc.params(row: T::Array[String]).void
24
- )
25
- .returns(T.untyped)
26
- end
27
- def self.for_each_compat(input, has_headers: true, delimiter: nil, &blk)
16
+ def self.for_each(
17
+ input,
18
+ has_headers: true,
19
+ col_sep: nil,
20
+ quote_char: nil,
21
+ nil_string: nil,
22
+ buffer_size: nil,
23
+ result_type: nil,
24
+ &blk
25
+ )
28
26
  end
29
27
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
@@ -49,10 +49,17 @@ files:
49
49
  - Cargo.lock
50
50
  - Cargo.toml
51
51
  - Gemfile
52
+ - LICENSE
53
+ - README.md
52
54
  - Rakefile
53
55
  - ext/osv/Cargo.lock
54
56
  - ext/osv/Cargo.toml
55
57
  - ext/osv/extconf.rb
58
+ - ext/osv/src/csv/builder.rs
59
+ - ext/osv/src/csv/mod.rs
60
+ - ext/osv/src/csv/parser.rs
61
+ - ext/osv/src/csv/reader.rs
62
+ - ext/osv/src/csv/record.rs
56
63
  - ext/osv/src/lib.rs
57
64
  - ext/osv/src/reader.rs
58
65
  - ext/osv/src/utils.rs