patchwork_csv_utils 0.1.6-x86_64-darwin → 0.1.8-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +2 -1
- data/ext/csv_utils/src/lib.rs +2 -0
- data/ext/csv_utils/src/utils/csv.rs +96 -0
- data/ext/csv_utils/src/utils/dedup.rs +5 -11
- data/ext/csv_utils/src/utils/mod.rs +5 -0
- data/ext/csv_utils/src/utils/xls.rs +30 -38
- data/lib/csv_utils/2.7/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.0/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.1/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.2/csv_utils.bundle +0 -0
- data/lib/csv_utils/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6603f8bcfc0587d4d92642e938c58f3077e7cc22a3147de6cc11ad83f562d337
|
4
|
+
data.tar.gz: 4b8e33dcd5fa853d8f9a95420d6f844645dc552f16651e91458c6429f92b67b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1102ac2ad779f7077324871702edb4597157715aae11fda27081c40dc6708aed94549053f4895ceb9dafcc862740de6b83b7cce09d770593bcf62dcb5fc1ed0
|
7
|
+
data.tar.gz: e33c5a1e1e521af86d6391093a7f0a5be3b77f733dad77e1114817aea944cd2788aa59b59b0aac94e0fbf82654ce0669c165f55966dc8d64ca0442b660becc22
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -14,7 +14,8 @@ gem install patchwork_csv_utils
|
|
14
14
|
```irb
|
15
15
|
require 'csv_utils'
|
16
16
|
CsvUtils.dedup('file1.csv', 'file2.csv', 'output.csv')
|
17
|
-
CsvUtils.to_csv('file1.xls', 'output_file1.csv', '
|
17
|
+
CsvUtils.to_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'])
|
18
|
+
CsvUtils.transform_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'])
|
18
19
|
```
|
19
20
|
|
20
21
|
## Release
|
data/ext/csv_utils/src/lib.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{define_module, function, prelude::*};
|
2
|
+
use crate::utils::csv::transform_csv;
|
2
3
|
use crate::utils::dedup::dedup;
|
3
4
|
use crate::utils::xls::to_csv;
|
4
5
|
|
@@ -9,5 +10,6 @@ fn init() -> Result<(), magnus::Error> {
|
|
9
10
|
let module = define_module("CsvUtils")?;
|
10
11
|
module.define_singleton_method("dedup", function!(dedup, 3))?;
|
11
12
|
module.define_singleton_method("to_csv", function!(to_csv, 3))?;
|
13
|
+
module.define_singleton_method("transform_csv", function!(transform_csv, 3))?;
|
12
14
|
Ok(())
|
13
15
|
}
|
@@ -0,0 +1,96 @@
|
|
1
|
+
use std::collections::HashMap;
|
2
|
+
use std::fs::File;
|
3
|
+
|
4
|
+
use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
|
5
|
+
use csv::{StringRecord, Writer};
|
6
|
+
use magnus::{RArray, Ruby};
|
7
|
+
|
8
|
+
use crate::utils::{FileExtension, magnus_err, missing_header};
|
9
|
+
|
10
|
+
pub fn transform_csv(ruby: &Ruby, csv_path: String, target_path: String, exclusions: RArray) -> magnus::error::Result<()> {
|
11
|
+
if !csv_path.has_extension(&["csv"]) {
|
12
|
+
return Err(magnus::Error::new(ruby.exception_standard_error(), "csv_path must be a csv file".to_string()));
|
13
|
+
}
|
14
|
+
|
15
|
+
let exclusions = RArray::to_vec(exclusions)?;
|
16
|
+
|
17
|
+
let csv_file = File::open(csv_path).map_err(|e| magnus_err(ruby, e, "csv_path"))?;
|
18
|
+
let mut csv: csv::Reader<File> = csv::Reader::from_reader(csv_file);
|
19
|
+
let mut wtr = Writer::from_path(target_path).map_err(|e| magnus_err(ruby, e, "target_path"))?;
|
20
|
+
let headers = csv.headers().map_err(|e| magnus_err(ruby, e, "csv_path headers"))?;
|
21
|
+
let header_map: HashMap<String, usize> = headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect();
|
22
|
+
let inverse_header_map: HashMap<usize, String> = headers.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
|
23
|
+
|
24
|
+
wtr.write_byte_record(headers.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
25
|
+
|
26
|
+
let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
|
27
|
+
let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
|
28
|
+
let start = header_map.get("Start").ok_or(missing_header(ruby, "Start"))?;
|
29
|
+
let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
|
30
|
+
let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
|
31
|
+
let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
|
32
|
+
|
33
|
+
for (ri, record) in csv.records().enumerate() {
|
34
|
+
let record = record.map_err(|e| magnus_err(ruby, e, "record"))?;
|
35
|
+
|
36
|
+
if skip_excluded_rows(request_id, &record, &exclusions) { continue; }
|
37
|
+
if has_empty_row_skip(&record) { continue; }
|
38
|
+
if has_empty_first_col_skip_row(&record) { continue; }
|
39
|
+
|
40
|
+
let mut date_value = Utc::now().naive_utc();
|
41
|
+
|
42
|
+
let record = record.iter().enumerate().map(|(i, c)| {
|
43
|
+
let c = c.trim_end();
|
44
|
+
if i == *date {
|
45
|
+
let current = string_to_datetime(c).ok_or(to_datetime_error(ruby, c, ri, "Date"))?;
|
46
|
+
date_value = current;
|
47
|
+
Ok(current.to_string())
|
48
|
+
} else if i == *start || i == *end || i == *actual_start || i == *actual_end {
|
49
|
+
if c.is_empty() { return Ok(c.to_string()); }
|
50
|
+
let unknown = "Unknown".to_string();
|
51
|
+
let column_name = inverse_header_map.get(&i).unwrap_or(&unknown);
|
52
|
+
let current_time = string_to_time(c).ok_or(to_datetime_error(ruby, c, ri, column_name))?;
|
53
|
+
let datetime = transform_time_to_datetime(date_value, current_time);
|
54
|
+
Ok(datetime.to_string())
|
55
|
+
} else {
|
56
|
+
Ok(c.to_string())
|
57
|
+
}
|
58
|
+
}).collect::<Result<StringRecord, magnus::Error>>()?;
|
59
|
+
|
60
|
+
let record = record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
|
61
|
+
wtr.write_byte_record(record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
62
|
+
}
|
63
|
+
|
64
|
+
wtr.flush().map_err(|e| magnus_err(ruby, e, "flush"))?;
|
65
|
+
|
66
|
+
Ok(())
|
67
|
+
}
|
68
|
+
|
69
|
+
fn skip_excluded_rows(request_id: &usize, r: &StringRecord, exclusions: &Vec<String>) -> bool {
|
70
|
+
let value = r.get(*request_id).unwrap_or_default();
|
71
|
+
exclusions.contains(&value.to_string())
|
72
|
+
}
|
73
|
+
|
74
|
+
fn string_to_datetime(s: &str) -> Option<NaiveDateTime> {
|
75
|
+
NaiveDate::parse_from_str(s, "%d-%b-%y").ok().map(|d| d.and_hms_opt(0, 0, 0)).flatten()
|
76
|
+
}
|
77
|
+
|
78
|
+
fn string_to_time(s: &str) -> Option<NaiveTime> {
|
79
|
+
NaiveTime::parse_from_str(s, "%H:%M").ok()
|
80
|
+
}
|
81
|
+
|
82
|
+
fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveTime) -> NaiveDateTime {
|
83
|
+
NaiveDateTime::new(t1.date(), t2)
|
84
|
+
}
|
85
|
+
|
86
|
+
fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus::Error {
|
87
|
+
magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
|
88
|
+
}
|
89
|
+
|
90
|
+
fn has_empty_first_col_skip_row(record: &StringRecord) -> bool {
|
91
|
+
record[0].is_empty()
|
92
|
+
}
|
93
|
+
|
94
|
+
fn has_empty_row_skip(record: &StringRecord) -> bool {
|
95
|
+
record.iter().all(|r| r.is_empty())
|
96
|
+
}
|
@@ -28,7 +28,7 @@ pub fn dedup(ruby: &Ruby, previous_csv_path: String, new_csv_path: String, targe
|
|
28
28
|
return Err(magnus::Error::new(ruby.exception_standard_error(), "headers of both csv files must be the same".to_string()));
|
29
29
|
}
|
30
30
|
|
31
|
-
wtr.write_byte_record(previous_headers.as_byte_record()).
|
31
|
+
wtr.write_byte_record(previous_headers.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
32
32
|
|
33
33
|
let mut previous_records = vec![];
|
34
34
|
for previous_record in previous_csv.records() {
|
@@ -49,27 +49,21 @@ pub fn dedup(ruby: &Ruby, previous_csv_path: String, new_csv_path: String, targe
|
|
49
49
|
|
50
50
|
let new_record = new_record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
|
51
51
|
if !previous_records.contains(&new_record) {
|
52
|
-
wtr.write_byte_record(new_record.as_byte_record()).
|
52
|
+
wtr.write_byte_record(new_record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
53
53
|
}
|
54
54
|
}
|
55
55
|
|
56
|
-
wtr.flush().
|
56
|
+
wtr.flush().map_err(|e| magnus_err(ruby, e, "flush"))?;
|
57
57
|
|
58
58
|
Ok(())
|
59
59
|
}
|
60
60
|
|
61
61
|
fn has_empty_first_col_skip_row(previous_record: &StringRecord) -> bool {
|
62
|
-
|
63
|
-
return true;
|
64
|
-
}
|
65
|
-
false
|
62
|
+
previous_record[0].is_empty()
|
66
63
|
}
|
67
64
|
|
68
65
|
fn has_empty_row_skip(record: &StringRecord) -> bool {
|
69
|
-
|
70
|
-
return true;
|
71
|
-
}
|
72
|
-
false
|
66
|
+
record.iter().all(|r| r.is_empty())
|
73
67
|
}
|
74
68
|
|
75
69
|
|
@@ -3,9 +3,14 @@ use std::ffi::OsStr;
|
|
3
3
|
use std::path::Path;
|
4
4
|
use magnus::Ruby;
|
5
5
|
|
6
|
+
pub mod csv;
|
6
7
|
pub mod dedup;
|
7
8
|
pub mod xls;
|
8
9
|
|
10
|
+
fn missing_header(ruby: &Ruby, header: &str) -> magnus::Error {
|
11
|
+
magnus::Error::new(ruby.exception_standard_error(), format!("Missing '{}' header", header))
|
12
|
+
}
|
13
|
+
|
9
14
|
fn magnus_err<E: Error>(ruby: &Ruby, e: E, msg: &str) -> magnus::Error {
|
10
15
|
magnus::Error::new(ruby.exception_standard_error(), format!("{}: {}", msg, e.to_string()))
|
11
16
|
}
|
@@ -6,19 +6,15 @@ use calamine::{Data, open_workbook, Range, Reader, Xls};
|
|
6
6
|
use chrono::{NaiveDateTime, Utc};
|
7
7
|
use magnus::{RArray, Ruby};
|
8
8
|
|
9
|
-
use crate::utils::{FileExtension, magnus_err};
|
9
|
+
use crate::utils::{FileExtension, magnus_err, missing_header};
|
10
10
|
|
11
11
|
pub fn to_csv(ruby: &Ruby, xls_path: String, target_path: String, exclusions: RArray) -> magnus::error::Result<()> {
|
12
|
-
let exclusions = RArray::to_vec(exclusions)?;
|
13
|
-
|
14
|
-
println!("xls_path: {:?}", xls_path);
|
15
|
-
println!("target_path: {:?}", target_path);
|
16
|
-
println!("exclusions: {:?}", exclusions);
|
17
|
-
|
18
12
|
if !xls_path.has_extension(&["xls"]) {
|
19
13
|
return Err(magnus::Error::new(ruby.exception_standard_error(), "xls_path must be an xls file".to_string()));
|
20
14
|
}
|
21
15
|
|
16
|
+
let exclusions = RArray::to_vec(exclusions)?;
|
17
|
+
|
22
18
|
let mut workbook: Xls<_> = open_workbook(xls_path.clone()).map_err(|e| magnus_err(ruby, e, format!("could not open xls: {}", xls_path).as_str()))?;
|
23
19
|
let range = workbook.worksheet_range_at(0)
|
24
20
|
.ok_or(magnus::Error::new(ruby.exception_standard_error(), "no worksheet found in xls".to_string()))
|
@@ -34,18 +30,23 @@ pub fn to_csv(ruby: &Ruby, xls_path: String, target_path: String, exclusions: RA
|
|
34
30
|
|
35
31
|
fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_map: HashMap<String, usize>, exclusions: Vec<String>) -> magnus::error::Result<()> {
|
36
32
|
let n = range.get_size().1 - 1;
|
33
|
+
|
34
|
+
let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
|
35
|
+
let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
|
36
|
+
let start = header_map.get("Start").ok_or(missing_header(ruby, "Start"))?;
|
37
|
+
let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
|
38
|
+
let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
|
39
|
+
let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
|
40
|
+
|
37
41
|
for (ri, r) in range.rows().enumerate() {
|
38
42
|
let mut date_value = Utc::now().naive_utc();
|
39
43
|
|
40
|
-
if skip_excluded_rows(&
|
44
|
+
if skip_excluded_rows(&request_id, r, &exclusions) { continue; }
|
41
45
|
if skip_empty_rows(r) { continue; }
|
42
|
-
if skip_rows_with_no_request_id(&
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
|
47
|
-
let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
|
48
|
-
let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
|
46
|
+
if skip_rows_with_no_request_id(&request_id, r) { continue; }
|
47
|
+
if date_value_is_not_present(&date, r) {
|
48
|
+
return Err(magnus::Error::new(ruby.exception_standard_error(), format!("Date value is not present in row: {}", ri)));
|
49
|
+
}
|
49
50
|
|
50
51
|
for (i, c) in r.iter().enumerate() {
|
51
52
|
match *c {
|
@@ -55,7 +56,7 @@ fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_ma
|
|
55
56
|
}
|
56
57
|
Data::Float(ref f) => write!(dest, "{}", f),
|
57
58
|
Data::DateTime(ref d) => {
|
58
|
-
let mut current = d.as_datetime().
|
59
|
+
let mut current = d.as_datetime().ok_or(to_datetime_error(ruby, &d.to_string(), ri, "Date"))?;
|
59
60
|
if i == *date {
|
60
61
|
date_value = current;
|
61
62
|
} else if i == *start || i == *end || i == *actual_start || i == *actual_end {
|
@@ -76,40 +77,31 @@ fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_ma
|
|
76
77
|
Ok(())
|
77
78
|
}
|
78
79
|
|
79
|
-
fn
|
80
|
-
|
81
|
-
let value = r[*request_id].to_string();
|
82
|
-
if exclusions.contains(&value) {
|
83
|
-
return true;
|
84
|
-
}
|
85
|
-
}
|
86
|
-
false
|
80
|
+
fn date_value_is_not_present(date: &usize, r: &[Data]) -> bool {
|
81
|
+
r[*date] == Data::Empty
|
87
82
|
}
|
88
83
|
|
89
|
-
fn
|
90
|
-
|
91
|
-
|
92
|
-
}
|
93
|
-
false
|
84
|
+
fn skip_excluded_rows(request_id: &usize, r: &[Data], exclusions: &Vec<String>) -> bool {
|
85
|
+
let value = r[*request_id].to_string();
|
86
|
+
exclusions.contains(&value.to_string())
|
94
87
|
}
|
95
88
|
|
96
|
-
fn
|
97
|
-
|
98
|
-
if r[*request_id] == Data::Empty {
|
99
|
-
return true;
|
100
|
-
}
|
101
|
-
}
|
102
|
-
false
|
89
|
+
fn skip_empty_rows(r: &[Data]) -> bool {
|
90
|
+
r.iter().all(|c| c == &Data::Empty)
|
103
91
|
}
|
104
92
|
|
105
|
-
fn
|
106
|
-
|
93
|
+
fn skip_rows_with_no_request_id(request_id: &usize, r: &[Data]) -> bool {
|
94
|
+
r[*request_id] == Data::Empty
|
107
95
|
}
|
108
96
|
|
109
97
|
fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveDateTime) -> NaiveDateTime {
|
110
98
|
NaiveDateTime::new(t1.date(), t2.time())
|
111
99
|
}
|
112
100
|
|
101
|
+
fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus::Error {
|
102
|
+
magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
|
103
|
+
}
|
104
|
+
|
113
105
|
fn handle_commas<W: Write>(dest: &mut W, s: &str) -> std::io::Result<()> {
|
114
106
|
if s.contains(",") {
|
115
107
|
write!(dest, "{:?}", clean_strings(s).trim_end())
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/csv_utils/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: patchwork_csv_utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- kingsley.hendrickse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Deduplication of CSV files and XLS to CSV conversion.
|
14
14
|
email:
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- ext/csv_utils/Cargo.toml
|
29
29
|
- ext/csv_utils/extconf.rb
|
30
30
|
- ext/csv_utils/src/lib.rs
|
31
|
+
- ext/csv_utils/src/utils/csv.rs
|
31
32
|
- ext/csv_utils/src/utils/dedup.rs
|
32
33
|
- ext/csv_utils/src/utils/mod.rs
|
33
34
|
- ext/csv_utils/src/utils/xls.rs
|