osv 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1b0c347b0bab5c9d31069c56f47999bfa51e85dfc1e127d1c4474a84ac19c53
4
- data.tar.gz: 847b199da27b7c1329c1fa64fc8636592f004e93a12fc2ddd8db6127298ac23d
3
+ metadata.gz: f22d1d56b0eba1e23ca192db2c70e68689486e2f0032672285017e1f98a530d2
4
+ data.tar.gz: 6288dce70b95faf312e8aa244ba56a60c3d59b85bdd16a4a951060df78b97e1e
5
5
  SHA512:
6
- metadata.gz: 62fa77c1ca98031f483569a4dba7cf9e4eca52a4b5fae293d274d5f89c48003e301eab01d95116cfc9cc6a2642e742d16046231a21d25e4a5143bd6ec3b40dac
7
- data.tar.gz: 3832cbb6ebadfc718a8a5d1963de960ed3abf09d4559d2f0ffe031c642a6c2581dc6ad6edf5d65f22248812585ca464916375503753831024fc355fe4cd04455
6
+ metadata.gz: 5399b43ecd3987c73daf09341d51a1ee8e5d060f1085e9a7aac9b823ab723ccbbc4084c5c0c9abbd36e7e17becfcfa1757af6af3666e01ef3992a27e35b5b983
7
+ data.tar.gz: 2a6d98b645af40ab08a5a01a2bcf5b67ce9ebff18a993e602f6b00fa2f3f80d65c63605a7f5a715286ac20751db0607c515686e4439156a4039ae24f49e95e10
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Nathan Jaremko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # OSV
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv)
4
+
5
+ OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate.
6
+
7
+ It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
8
+
9
+ The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'osv'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ ```bash
22
+ bundle install
23
+ ```
24
+
25
+ Or install it directly:
26
+
27
+ ```bash
28
+ gem install osv
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ### Basic Usage with Hash Output
34
+
35
+ Each row is returned as a hash where the keys are the column headers:
36
+
37
+ ```ruby
38
+ require 'osv'
39
+
40
+ # Read from a file
41
+ OSV.for_each("path/to/file.csv") do |row|
42
+ # row is a Hash like {"name" => "John", "age" => "25"}
43
+ puts row["name"]
44
+ end
45
+
46
+ # Without a block, returns an Enumerator
47
+ rows = OSV.for_each("path/to/file.csv")
48
+ rows.each { |row| puts row["name"] }
49
+ ```
50
+
51
+ ### Array Output Mode
52
+
53
+ If you prefer working with arrays instead of hashes, pass `result_type: :array` to `for_each`:
54
+
55
+ ```ruby
56
+ OSV.for_each("path/to/file.csv", result_type: :array) do |row|
57
+ # row is an Array like ["John", "25"]
58
+ puts row[0]
59
+ end
60
+ ```
61
+
62
+ ### Options
63
+
64
+ Both output modes (hash and array) support the following options:
65
+
66
+ - `has_headers`: Boolean indicating if the first row contains headers (default: true)
67
+ - `col_sep`: String specifying the field separator (default: ",")
68
+
69
+ ```ruby
70
+ # Reading TSV files
71
+ OSV.for_each("path/to/file.tsv", col_sep: "\t") do |row|
72
+ puts row["name"]
73
+ end
74
+
75
+ # Reading without headers
76
+ OSV.for_each("path/to/file.csv", has_headers: false) do |row|
77
+ # Headers will be automatically generated as "c0", "c1", etc.
78
+ puts row["c0"]
79
+ end
80
+ ```
81
+
82
+ ### Input Sources
83
+
84
+ OSV supports reading from:
85
+
86
+ - File paths (as strings)
87
+ - IO objects
88
+ - Important caveat: the IO object must be backed by a real file descriptor (OSV obtains it via the C API function `rb_io_descriptor`).
89
+ - StringIO objects
90
+ - Note: when you do this, the string is read (in full) into a Rust string, and we parse it there.
91
+
92
+ ```ruby
93
+ # From file path
94
+ OSV.for_each("path/to/file.csv") { |row| puts row["name"] }
95
+
96
+ # From IO object
97
+ File.open("path/to/file.csv") do |file|
98
+ OSV.for_each(file) { |row| puts row["name"] }
99
+ end
100
+
101
+ # From StringIO
102
+ data = StringIO.new("name,age\nJohn,25")
103
+ OSV.for_each(data) { |row| puts row["name"] }
104
+ ```
105
+
106
+ ## Requirements
107
+
108
+ - Ruby >= 3.1.0
109
+ - Rust toolchain (for installation from source)
110
+
111
+ ## Performance
112
+
113
+ This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
@@ -0,0 +1,114 @@
1
+ use super::{
2
+ parser::RecordParser,
3
+ reader::{ReadImpl, RecordReader},
4
+ };
5
+ use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
6
+ use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
7
+
8
+ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
9
+ ruby: &'a Ruby,
10
+ to_read: Value,
11
+ has_headers: bool,
12
+ delimiter: u8,
13
+ quote_char: u8,
14
+ null_string: String,
15
+ buffer: usize,
16
+ _phantom: PhantomData<T>,
17
+ }
18
+
19
+ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
20
+ pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
21
+ Self {
22
+ ruby,
23
+ to_read,
24
+ has_headers: true,
25
+ delimiter: b',',
26
+ quote_char: b'"',
27
+ null_string: String::new(),
28
+ buffer: 1000,
29
+ _phantom: PhantomData,
30
+ }
31
+ }
32
+
33
+ pub fn has_headers(mut self, has_headers: bool) -> Self {
34
+ self.has_headers = has_headers;
35
+ self
36
+ }
37
+
38
+ pub fn delimiter(mut self, delimiter: u8) -> Self {
39
+ self.delimiter = delimiter;
40
+ self
41
+ }
42
+
43
+ pub fn quote_char(mut self, quote_char: u8) -> Self {
44
+ self.quote_char = quote_char;
45
+ self
46
+ }
47
+
48
+ pub fn null_string(mut self, null_string: String) -> Self {
49
+ self.null_string = null_string;
50
+ self
51
+ }
52
+
53
+ pub fn buffer(mut self, buffer: usize) -> Self {
54
+ self.buffer = buffer;
55
+ self
56
+ }
57
+
58
+ fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, Error> {
59
+ let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
60
+
61
+ if self.to_read.is_kind_of(string_io) {
62
+ let string: RString = self.to_read.funcall("string", ())?;
63
+ let content = string.to_string()?;
64
+ Ok(Box::new(std::io::Cursor::new(content)))
65
+ } else if self.to_read.is_kind_of(self.ruby.class_io()) {
66
+ let fd = unsafe { rb_sys::rb_io_descriptor(self.to_read.as_raw()) };
67
+ let file = unsafe { File::from_raw_fd(fd) };
68
+ Ok(Box::new(file))
69
+ } else {
70
+ let path = self.to_read.to_r_string()?.to_string()?;
71
+ let file = std::fs::File::open(&path).map_err(|e| {
72
+ Error::new(
73
+ self.ruby.exception_runtime_error(),
74
+ format!("Failed to open file: {e}"),
75
+ )
76
+ })?;
77
+ Ok(Box::new(file))
78
+ }
79
+ }
80
+
81
+ pub fn build(self) -> Result<RecordReader<T>, Error> {
82
+ let readable = self.get_reader()?;
83
+
84
+ let mut reader = csv::ReaderBuilder::new()
85
+ .has_headers(self.has_headers)
86
+ .delimiter(self.delimiter)
87
+ .quote(self.quote_char)
88
+ .from_reader(readable);
89
+
90
+ let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
91
+ let headers_clone = headers.clone();
92
+ let null_string = self.null_string;
93
+
94
+ let (sender, receiver) = kanal::bounded(self.buffer);
95
+ let handle = thread::spawn(move || {
96
+ let mut record = csv::StringRecord::new();
97
+ while let Ok(true) = reader.read_record(&mut record) {
98
+ let row = T::parse(&headers_clone, &record, &null_string);
99
+ if sender.send(row).is_err() {
100
+ break;
101
+ }
102
+ }
103
+ let file_to_forget = reader.into_inner();
104
+ std::mem::forget(file_to_forget);
105
+ });
106
+
107
+ Ok(RecordReader {
108
+ reader: ReadImpl::MultiThreaded {
109
+ receiver,
110
+ handle: Some(handle),
111
+ },
112
+ })
113
+ }
114
+ }
@@ -0,0 +1,8 @@
1
+ mod builder;
2
+ mod parser;
3
+ mod reader;
4
+ mod record;
5
+
6
+ pub use builder::RecordReaderBuilder;
7
+ pub use record::CsvRecord;
8
+
@@ -0,0 +1,43 @@
1
+ use std::collections::HashMap;
2
+
3
+ pub trait RecordParser {
4
+ type Output;
5
+
6
+ fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output;
7
+ }
8
+
9
+ impl RecordParser for HashMap<String, Option<String>> {
10
+ type Output = Self;
11
+
12
+ fn parse(headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
13
+ headers
14
+ .iter()
15
+ .zip(record.iter())
16
+ .map(|(header, field)| {
17
+ let value = if field == null_string {
18
+ None
19
+ } else {
20
+ Some(field.to_string())
21
+ };
22
+ (header.clone(), value)
23
+ })
24
+ .collect()
25
+ }
26
+ }
27
+
28
+ impl RecordParser for Vec<Option<String>> {
29
+ type Output = Self;
30
+
31
+ fn parse(_headers: &[String], record: &csv::StringRecord, null_string: &str) -> Self::Output {
32
+ record
33
+ .iter()
34
+ .map(|field| {
35
+ if field == null_string {
36
+ None
37
+ } else {
38
+ Some(field.to_string())
39
+ }
40
+ })
41
+ .collect()
42
+ }
43
+ }
@@ -0,0 +1,73 @@
1
+ use super::parser::RecordParser;
2
+ use magnus::{Error, Ruby};
3
+ use std::{io::Read, thread};
4
+
5
+ pub struct RecordReader<T: RecordParser> {
6
+ pub(crate) reader: ReadImpl<T>,
7
+ }
8
+
9
+ #[allow(dead_code)]
10
+ pub enum ReadImpl<T: RecordParser> {
11
+ SingleThreaded {
12
+ reader: csv::Reader<Box<dyn Read + Send + 'static>>,
13
+ headers: Vec<String>,
14
+ null_string: String,
15
+ },
16
+ MultiThreaded {
17
+ receiver: kanal::Receiver<T::Output>,
18
+ handle: Option<thread::JoinHandle<()>>,
19
+ },
20
+ }
21
+
22
+ impl<T: RecordParser> RecordReader<T> {
23
+ pub(crate) fn get_headers(
24
+ ruby: &Ruby,
25
+ reader: &mut csv::Reader<impl Read>,
26
+ has_headers: bool,
27
+ ) -> Result<Vec<String>, Error> {
28
+ let first_row = reader
29
+ .headers()
30
+ .map_err(|e| {
31
+ Error::new(
32
+ ruby.exception_runtime_error(),
33
+ format!("Failed to read headers: {e}"),
34
+ )
35
+ })?
36
+ .clone();
37
+
38
+ Ok(if has_headers {
39
+ first_row.iter().map(String::from).collect()
40
+ } else {
41
+ (0..first_row.len()).map(|i| format!("c{i}")).collect()
42
+ })
43
+ }
44
+ }
45
+
46
+ impl<T: RecordParser> Iterator for RecordReader<T> {
47
+ type Item = T::Output;
48
+
49
+ fn next(&mut self) -> Option<Self::Item> {
50
+ match &mut self.reader {
51
+ ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
52
+ Ok(record) => Some(record),
53
+ Err(_) => {
54
+ if let Some(handle) = handle.take() {
55
+ let _ = handle.join();
56
+ }
57
+ None
58
+ }
59
+ },
60
+ ReadImpl::SingleThreaded {
61
+ reader,
62
+ headers,
63
+ null_string,
64
+ } => {
65
+ let mut record = csv::StringRecord::new();
66
+ match reader.read_record(&mut record) {
67
+ Ok(true) => Some(T::parse(headers, &record, null_string)),
68
+ _ => None,
69
+ }
70
+ }
71
+ }
72
+ }
73
+ }
@@ -0,0 +1,17 @@
1
+ use magnus::{IntoValue, Ruby, Value};
2
+ use std::collections::HashMap;
3
+
4
+ #[derive(Debug)]
5
+ pub enum CsvRecord {
6
+ Vec(Vec<Option<String>>),
7
+ Map(HashMap<String, Option<String>>),
8
+ }
9
+
10
+ impl IntoValue for CsvRecord {
11
+ fn into_value_with(self, handle: &Ruby) -> Value {
12
+ match self {
13
+ CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
+ CsvRecord::Map(map) => map.into_value_with(handle),
15
+ }
16
+ }
17
+ }
data/ext/osv/src/lib.rs CHANGED
@@ -1,3 +1,4 @@
1
+ mod csv;
1
2
  mod reader;
2
3
  mod utils;
3
4
 
@@ -10,6 +11,5 @@ use magnus::{Error, Ruby};
10
11
  fn init(ruby: &Ruby) -> Result<(), Error> {
11
12
  let module = ruby.define_module("OSV")?;
12
13
  module.define_module_function("for_each", magnus::method!(parse_csv, -1))?;
13
- module.define_module_function("for_each_compat", magnus::method!(parse_compat, -1))?;
14
14
  Ok(())
15
15
  }
@@ -1,230 +1,100 @@
1
+ use std::collections::HashMap;
2
+
3
+ use crate::csv::{CsvRecord, RecordReaderBuilder};
1
4
  use crate::utils::*;
2
- use magnus::{
3
- block::Yield, rb_sys::AsRawValue, value::ReprValue, Error, RClass, RString, Ruby, Value,
4
- };
5
- use std::{fs::File, io::Read, os::fd::FromRawFd, thread};
5
+ use magnus::value::ReprValue;
6
+ use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
6
7
 
7
- /// Parses CSV data from a file and yields each row as a hash to the block.
8
8
  pub fn parse_csv(
9
9
  ruby: &Ruby,
10
10
  rb_self: Value,
11
11
  args: &[Value],
12
- ) -> Result<Yield<impl Iterator<Item = std::collections::HashMap<String, String>>>, Error> {
13
- if !ruby.block_given() {
14
- return Ok(Yield::Enumerator(rb_self.enumeratorize("for_each", args)));
15
- }
16
- let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
17
-
18
- let iter = RecordReader::<std::collections::HashMap<String, String>>::new(
19
- ruby,
12
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
13
+ let CsvArgs {
20
14
  to_read,
21
15
  has_headers,
22
- delimiter.unwrap_or_else(|| ",".to_string()).as_bytes()[0],
23
- 1000,
24
- )?;
16
+ delimiter,
17
+ quote_char,
18
+ null_string,
19
+ buffer_size,
20
+ result_type,
21
+ } = parse_csv_args(ruby, args)?;
25
22
 
26
- Ok(Yield::Iter(iter))
27
- }
28
-
29
- pub fn parse_compat(
30
- ruby: &Ruby,
31
- rb_self: Value,
32
- args: &[Value],
33
- ) -> Result<Yield<impl Iterator<Item = Vec<String>>>, Error> {
34
23
  if !ruby.block_given() {
35
- return Ok(Yield::Enumerator(
36
- rb_self.enumeratorize("for_each_compat", args),
37
- ));
24
+ return create_enumerator(EnumeratorArgs {
25
+ rb_self,
26
+ to_read,
27
+ has_headers,
28
+ delimiter,
29
+ quote_char,
30
+ null_string,
31
+ buffer_size,
32
+ result_type,
33
+ });
38
34
  }
39
- let (to_read, has_headers, delimiter) = parse_csv_args(args)?;
40
35
 
41
- let iter = RecordReader::<Vec<String>>::new(
42
- ruby,
43
- to_read,
44
- has_headers,
45
- delimiter.unwrap_or_else(|| ",".to_string()).as_bytes()[0],
46
- 1000,
47
- )?;
36
+ let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
37
+ "hash" => Box::new(
38
+ RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
39
+ .has_headers(has_headers)
40
+ .delimiter(delimiter)
41
+ .quote_char(quote_char)
42
+ .null_string(null_string)
43
+ .buffer(buffer_size)
44
+ .build()?
45
+ .map(CsvRecord::Map),
46
+ ),
47
+ "array" => Box::new(
48
+ RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
49
+ .has_headers(has_headers)
50
+ .delimiter(delimiter)
51
+ .quote_char(quote_char)
52
+ .null_string(null_string)
53
+ .buffer(buffer_size)
54
+ .build()?
55
+ .map(CsvRecord::Vec),
56
+ ),
57
+ _ => {
58
+ return Err(Error::new(
59
+ ruby.exception_runtime_error(),
60
+ "Invalid result type",
61
+ ))
62
+ }
63
+ };
48
64
 
49
65
  Ok(Yield::Iter(iter))
50
66
  }
51
67
 
52
- pub trait RecordParser {
53
- type Output;
54
-
55
- fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output;
56
- }
57
-
58
- impl RecordParser for std::collections::HashMap<String, String> {
59
- type Output = Self;
60
-
61
- fn parse(headers: &[String], record: &csv::StringRecord) -> Self::Output {
62
- record
63
- .iter()
64
- .enumerate()
65
- .map(|(i, field)| (headers[i].clone(), field.to_string()))
66
- .collect()
67
- }
68
- }
69
-
70
- impl RecordParser for Vec<String> {
71
- type Output = Self;
72
-
73
- fn parse(_headers: &[String], record: &csv::StringRecord) -> Self::Output {
74
- record.iter().map(|field| field.to_string()).collect()
75
- }
76
- }
77
-
78
- struct RecordReader<T: RecordParser> {
79
- reader: ReadImpl<T>,
80
- }
81
-
82
- #[allow(dead_code)]
83
- enum ReadImpl<T: RecordParser> {
84
- SingleThreaded {
85
- reader: csv::Reader<Box<dyn Read + Send + 'static>>,
86
- headers: Vec<String>,
87
- },
88
- MultiThreaded {
89
- receiver: kanal::Receiver<T::Output>,
90
- handle: Option<thread::JoinHandle<()>>,
91
- },
92
- }
93
-
94
- impl<T: RecordParser + Send + 'static> RecordReader<T> {
95
- fn new(
96
- ruby: &Ruby,
97
- to_read: Value,
98
- has_headers: bool,
99
- delimiter: u8,
100
- buffer: usize,
101
- ) -> Result<Self, Error> {
102
- let string_io = RClass::from(ruby.eval("StringIO").map_err(|e| {
103
- Error::new(
104
- ruby.exception_runtime_error(),
105
- format!("Failed to get StringIO class: {}", e),
106
- )
107
- })?);
108
-
109
- let readable: Box<dyn Read + Send + 'static> = if to_read.is_kind_of(string_io) {
110
- let string: RString = to_read.funcall("string", ()).map_err(|e| {
111
- Error::new(
112
- ruby.exception_runtime_error(),
113
- format!("Failed to get string from StringIO: {}", e),
114
- )
115
- })?;
116
- let content = string.to_string().map_err(|e| {
117
- Error::new(
118
- ruby.exception_runtime_error(),
119
- format!("Failed to convert string to Rust String: {}", e),
120
- )
121
- })?;
122
- Box::new(std::io::Cursor::new(content))
123
- } else if to_read.is_kind_of(ruby.class_io()) {
124
- let fd = unsafe { rb_sys::rb_io_descriptor(to_read.as_raw()) };
125
- let file = unsafe { File::from_raw_fd(fd) };
126
- Box::new(file)
127
- } else {
128
- let path = to_read
129
- .to_r_string()
130
- .map_err(|e| {
131
- Error::new(
132
- ruby.exception_runtime_error(),
133
- format!("Failed to convert path to string: {}", e),
134
- )
135
- })?
136
- .to_string()
137
- .map_err(|e| {
138
- Error::new(
139
- ruby.exception_runtime_error(),
140
- format!("Failed to convert RString to Rust String: {}", e),
141
- )
142
- })?;
143
- let file = std::fs::File::open(&path).map_err(|e| {
144
- Error::new(
145
- ruby.exception_runtime_error(),
146
- format!("Failed to open file: {}", e),
147
- )
148
- })?;
149
- Box::new(file)
150
- };
151
-
152
- let mut reader = csv::ReaderBuilder::new()
153
- .has_headers(has_headers)
154
- .delimiter(delimiter)
155
- .from_reader(readable);
156
-
157
- let headers = Self::get_headers(&mut reader, has_headers)?;
158
- let headers_clone = headers.clone();
159
-
160
- let (sender, receiver) = kanal::bounded(buffer);
161
- let handle = thread::spawn(move || {
162
- let mut record = csv::StringRecord::new();
163
- while let Ok(read) = reader.read_record(&mut record) {
164
- if !read {
165
- let file_to_forget = reader.into_inner();
166
- std::mem::forget(file_to_forget);
167
- break;
168
- }
169
- let row = T::parse(&headers_clone, &record);
170
- if sender.send(row).is_err() {
171
- break;
172
- }
173
- }
174
- });
175
-
176
- let read_impl = ReadImpl::MultiThreaded {
177
- receiver,
178
- handle: Some(handle),
179
- };
180
-
181
- Ok(Self { reader: read_impl })
182
- }
183
-
184
- fn get_headers(
185
- reader: &mut csv::Reader<impl Read>,
186
- has_headers: bool,
187
- ) -> Result<Vec<String>, Error> {
188
- let first_row = reader
189
- .headers()
190
- .map_err(|e| {
191
- Error::new(
192
- magnus::exception::runtime_error(),
193
- format!("Failed to read headers: {}", e),
194
- )
195
- })?
196
- .clone();
197
- let num_fields = first_row.len();
198
-
199
- Ok(if has_headers {
200
- first_row.iter().map(|h| h.to_string()).collect()
201
- } else {
202
- (0..num_fields).map(|i| format!("c{}", i)).collect()
203
- })
204
- }
68
+ struct EnumeratorArgs {
69
+ rb_self: Value,
70
+ to_read: Value,
71
+ has_headers: bool,
72
+ delimiter: u8,
73
+ quote_char: u8,
74
+ null_string: String,
75
+ buffer_size: usize,
76
+ result_type: String,
205
77
  }
206
78
 
207
- impl<T: RecordParser> Iterator for RecordReader<T> {
208
- type Item = T::Output;
209
-
210
- fn next(&mut self) -> Option<Self::Item> {
211
- match &mut self.reader {
212
- ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
213
- Ok(record) => Some(record),
214
- Err(_) => {
215
- if let Some(handle) = handle.take() {
216
- let _ = handle.join();
217
- }
218
- None
219
- }
220
- },
221
- ReadImpl::SingleThreaded { reader, headers } => {
222
- let mut record = csv::StringRecord::new();
223
- match reader.read_record(&mut record) {
224
- Ok(true) => Some(T::parse(headers, &record)),
225
- _ => None,
226
- }
227
- }
228
- }
229
- }
79
+ fn create_enumerator(
80
+ args: EnumeratorArgs,
81
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
82
+ let kwargs = RHash::new();
83
+ kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
84
+ kwargs.aset(
85
+ Symbol::new("col_sep"),
86
+ String::from_utf8(vec![args.delimiter]).unwrap(),
87
+ )?;
88
+ kwargs.aset(
89
+ Symbol::new("quote_char"),
90
+ String::from_utf8(vec![args.quote_char]).unwrap(),
91
+ )?;
92
+ kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
93
+ kwargs.aset(Symbol::new("buffer_size"), args.buffer_size)?;
94
+ kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
95
+
96
+ let enumerator = args
97
+ .rb_self
98
+ .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
99
+ Ok(Yield::Enumerator(enumerator))
230
100
  }
data/ext/osv/src/utils.rs CHANGED
@@ -1,20 +1,123 @@
1
1
  use magnus::{
2
2
  scan_args::{get_kwargs, scan_args},
3
- Error, Value,
3
+ value::ReprValue,
4
+ Error, RString, Ruby, Symbol, Value,
4
5
  };
5
6
 
7
+ #[derive(Debug)]
8
+ pub struct CsvArgs {
9
+ pub to_read: Value,
10
+ pub has_headers: bool,
11
+ pub delimiter: u8,
12
+ pub quote_char: u8,
13
+ pub null_string: String,
14
+ pub buffer_size: usize,
15
+ pub result_type: String,
16
+ }
17
+
6
18
  /// Parse common arguments for CSV parsing
7
- pub fn parse_csv_args(args: &[Value]) -> Result<(Value, bool, Option<String>), Error> {
19
+ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
8
20
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
9
21
  let (to_read,) = parsed_args.required;
10
22
 
11
- let kwargs = get_kwargs::<_, (), (Option<bool>, Option<String>), ()>(
23
+ let kwargs = get_kwargs::<
24
+ _,
25
+ (),
26
+ (
27
+ Option<bool>,
28
+ Option<String>,
29
+ Option<String>,
30
+ Option<String>,
31
+ Option<usize>,
32
+ Option<Value>,
33
+ ),
34
+ (),
35
+ >(
12
36
  parsed_args.keywords,
13
37
  &[],
14
- &["has_headers", "delimiter"],
38
+ &[
39
+ "has_headers",
40
+ "col_sep",
41
+ "quote_char",
42
+ "nil_string",
43
+ "buffer_size",
44
+ "result_type",
45
+ ],
15
46
  )?;
16
47
 
17
48
  let has_headers = kwargs.optional.0.unwrap_or(true);
18
49
 
19
- Ok((to_read, has_headers, kwargs.optional.1))
50
+ let delimiter = *kwargs
51
+ .optional
52
+ .1
53
+ .unwrap_or_else(|| ",".to_string())
54
+ .as_bytes()
55
+ .first()
56
+ .ok_or_else(|| {
57
+ Error::new(
58
+ magnus::exception::runtime_error(),
59
+ "Delimiter cannot be empty",
60
+ )
61
+ })?;
62
+
63
+ let quote_char = *kwargs
64
+ .optional
65
+ .2
66
+ .unwrap_or_else(|| "\"".to_string())
67
+ .as_bytes()
68
+ .first()
69
+ .ok_or_else(|| {
70
+ Error::new(
71
+ magnus::exception::runtime_error(),
72
+ "Quote character cannot be empty",
73
+ )
74
+ })?;
75
+
76
+ let null_string = kwargs.optional.3.unwrap_or_else(|| "".to_string());
77
+
78
+ let buffer_size = kwargs.optional.4.unwrap_or(1000);
79
+
80
+ let result_type = match kwargs.optional.5 {
81
+ Some(value) => {
82
+ let parsed = if value.is_kind_of(ruby.class_string()) {
83
+ RString::from_value(value)
84
+ .ok_or_else(|| {
85
+ Error::new(magnus::exception::type_error(), "Invalid string value")
86
+ })?
87
+ .to_string()?
88
+ } else if value.is_kind_of(ruby.class_symbol()) {
89
+ Symbol::from_value(value)
90
+ .ok_or_else(|| {
91
+ Error::new(magnus::exception::type_error(), "Invalid symbol value")
92
+ })?
93
+ .funcall("to_s", ())?
94
+ } else {
95
+ return Err(Error::new(
96
+ magnus::exception::type_error(),
97
+ "result_type must be a String or Symbol",
98
+ ));
99
+ };
100
+
101
+ match parsed.as_str() {
102
+ "hash" | "array" => parsed,
103
+ _ => {
104
+ return Err(Error::new(
105
+ magnus::exception::runtime_error(),
106
+ "result_type must be either 'hash' or 'array'",
107
+ ))
108
+ }
109
+ }
110
+ }
111
+ None => String::from("hash"),
112
+ };
113
+
114
+ Ok(CsvArgs {
115
+ to_read,
116
+ has_headers,
117
+ delimiter,
118
+ quote_char,
119
+ null_string,
120
+ buffer_size,
121
+ result_type,
122
+ })
20
123
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/osv.rbi CHANGED
@@ -2,28 +2,26 @@
2
2
 
3
3
  module OSV
4
4
  sig do
5
- type_parameters(:T)
6
- .params(
7
- input: T.any(String, StringIO, IO),
8
- has_headers: T.nilable(T::Boolean),
9
- delimiter: T.nilable(String),
10
- blk: T.proc.params(row: T::Hash[String, String]).void
11
- )
12
- .returns(T.untyped)
5
+ params(
6
+ input: T.any(String, StringIO, IO),
7
+ has_headers: T.nilable(T::Boolean),
8
+ col_sep: T.nilable(String),
9
+ quote_char: T.nilable(String),
10
+ nil_string: T.nilable(String),
11
+ buffer_size: T.nilable(Integer),
12
+ result_type: T.nilable(String),
13
+ blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
14
+ ).returns(T.any(Enumerator, T.untyped))
13
15
  end
14
- def self.for_each(input, has_headers: true, delimiter: nil, &blk)
15
- end
16
-
17
- sig do
18
- type_parameters(:T)
19
- .params(
20
- input: T.any(String, StringIO, IO),
21
- has_headers: T.nilable(T::Boolean),
22
- delimiter: T.nilable(String),
23
- blk: T.proc.params(row: T::Array[String]).void
24
- )
25
- .returns(T.untyped)
26
- end
27
- def self.for_each_compat(input, has_headers: true, delimiter: nil, &blk)
16
+ def self.for_each(
17
+ input,
18
+ has_headers: true,
19
+ col_sep: nil,
20
+ quote_char: nil,
21
+ nil_string: nil,
22
+ buffer_size: nil,
23
+ result_type: nil,
24
+ &blk
25
+ )
28
26
  end
29
27
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
@@ -49,10 +49,17 @@ files:
49
49
  - Cargo.lock
50
50
  - Cargo.toml
51
51
  - Gemfile
52
+ - LICENSE
53
+ - README.md
52
54
  - Rakefile
53
55
  - ext/osv/Cargo.lock
54
56
  - ext/osv/Cargo.toml
55
57
  - ext/osv/extconf.rb
58
+ - ext/osv/src/csv/builder.rs
59
+ - ext/osv/src/csv/mod.rs
60
+ - ext/osv/src/csv/parser.rs
61
+ - ext/osv/src/csv/reader.rs
62
+ - ext/osv/src/csv/record.rs
56
63
  - ext/osv/src/lib.rs
57
64
  - ext/osv/src/reader.rs
58
65
  - ext/osv/src/utils.rs