patchwork_csv_utils 0.1.6-x86_64-darwin → 0.1.8-x86_64-darwin
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +2 -1
- data/ext/csv_utils/src/lib.rs +2 -0
- data/ext/csv_utils/src/utils/csv.rs +96 -0
- data/ext/csv_utils/src/utils/dedup.rs +5 -11
- data/ext/csv_utils/src/utils/mod.rs +5 -0
- data/ext/csv_utils/src/utils/xls.rs +30 -38
- data/lib/csv_utils/2.7/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.0/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.1/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.2/csv_utils.bundle +0 -0
- data/lib/csv_utils/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6603f8bcfc0587d4d92642e938c58f3077e7cc22a3147de6cc11ad83f562d337
+  data.tar.gz: 4b8e33dcd5fa853d8f9a95420d6f844645dc552f16651e91458c6429f92b67b3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a1102ac2ad779f7077324871702edb4597157715aae11fda27081c40dc6708aed94549053f4895ceb9dafcc862740de6b83b7cce09d770593bcf62dcb5fc1ed0
+  data.tar.gz: e33c5a1e1e521af86d6391093a7f0a5be3b77f733dad77e1114817aea944cd2788aa59b59b0aac94e0fbf82654ce0669c165f55966dc8d64ca0442b660becc22
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -14,7 +14,8 @@ gem install patchwork_csv_utils
 ```irb
 require 'csv_utils'
 CsvUtils.dedup('file1.csv', 'file2.csv', 'output.csv')
-CsvUtils.to_csv('file1.xls', 'output_file1.csv', '
+CsvUtils.to_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'])
+CsvUtils.transform_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'])
 ```
 
 ## Release
data/ext/csv_utils/src/lib.rs
CHANGED
@@ -1,4 +1,5 @@
 use magnus::{define_module, function, prelude::*};
+use crate::utils::csv::transform_csv;
 use crate::utils::dedup::dedup;
 use crate::utils::xls::to_csv;
 
@@ -9,5 +10,6 @@ fn init() -> Result<(), magnus::Error> {
     let module = define_module("CsvUtils")?;
     module.define_singleton_method("dedup", function!(dedup, 3))?;
     module.define_singleton_method("to_csv", function!(to_csv, 3))?;
+    module.define_singleton_method("transform_csv", function!(transform_csv, 3))?;
     Ok(())
 }
data/ext/csv_utils/src/utils/csv.rs
ADDED
@@ -0,0 +1,96 @@
+use std::collections::HashMap;
+use std::fs::File;
+
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use csv::{StringRecord, Writer};
+use magnus::{RArray, Ruby};
+
+use crate::utils::{FileExtension, magnus_err, missing_header};
+
+pub fn transform_csv(ruby: &Ruby, csv_path: String, target_path: String, exclusions: RArray) -> magnus::error::Result<()> {
+    if !csv_path.has_extension(&["csv"]) {
+        return Err(magnus::Error::new(ruby.exception_standard_error(), "csv_path must be a csv file".to_string()));
+    }
+
+    let exclusions = RArray::to_vec(exclusions)?;
+
+    let csv_file = File::open(csv_path).map_err(|e| magnus_err(ruby, e, "csv_path"))?;
+    let mut csv: csv::Reader<File> = csv::Reader::from_reader(csv_file);
+    let mut wtr = Writer::from_path(target_path).map_err(|e| magnus_err(ruby, e, "target_path"))?;
+    let headers = csv.headers().map_err(|e| magnus_err(ruby, e, "csv_path headers"))?;
+    let header_map: HashMap<String, usize> = headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect();
+    let inverse_header_map: HashMap<usize, String> = headers.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
+
+    wtr.write_byte_record(headers.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
+
+    let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
+    let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
+    let start = header_map.get("Start").ok_or(missing_header(ruby, "Start"))?;
+    let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
+    let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
+    let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
+
+    for (ri, record) in csv.records().enumerate() {
+        let record = record.map_err(|e| magnus_err(ruby, e, "record"))?;
+
+        if skip_excluded_rows(request_id, &record, &exclusions) { continue; }
+        if has_empty_row_skip(&record) { continue; }
+        if has_empty_first_col_skip_row(&record) { continue; }
+
+        let mut date_value = Utc::now().naive_utc();
+
+        let record = record.iter().enumerate().map(|(i, c)| {
+            let c = c.trim_end();
+            if i == *date {
+                let current = string_to_datetime(c).ok_or(to_datetime_error(ruby, c, ri, "Date"))?;
+                date_value = current;
+                Ok(current.to_string())
+            } else if i == *start || i == *end || i == *actual_start || i == *actual_end {
+                if c.is_empty() { return Ok(c.to_string()); }
+                let unknown = "Unknown".to_string();
+                let column_name = inverse_header_map.get(&i).unwrap_or(&unknown);
+                let current_time = string_to_time(c).ok_or(to_datetime_error(ruby, c, ri, column_name))?;
+                let datetime = transform_time_to_datetime(date_value, current_time);
+                Ok(datetime.to_string())
+            } else {
+                Ok(c.to_string())
+            }
+        }).collect::<Result<StringRecord, magnus::Error>>()?;
+
+        let record = record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
+        wtr.write_byte_record(record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
+    }
+
+    wtr.flush().map_err(|e| magnus_err(ruby, e, "flush"))?;
+
+    Ok(())
+}
+
+fn skip_excluded_rows(request_id: &usize, r: &StringRecord, exclusions: &Vec<String>) -> bool {
+    let value = r.get(*request_id).unwrap_or_default();
+    exclusions.contains(&value.to_string())
+}
+
+fn string_to_datetime(s: &str) -> Option<NaiveDateTime> {
+    NaiveDate::parse_from_str(s, "%d-%b-%y").ok().map(|d| d.and_hms_opt(0, 0, 0)).flatten()
+}
+
+fn string_to_time(s: &str) -> Option<NaiveTime> {
+    NaiveTime::parse_from_str(s, "%H:%M").ok()
+}
+
+fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveTime) -> NaiveDateTime {
+    NaiveDateTime::new(t1.date(), t2)
+}
+
+fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus::Error {
+    magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
+}
+
+fn has_empty_first_col_skip_row(record: &StringRecord) -> bool {
+    record[0].is_empty()
+}
+
+fn has_empty_row_skip(record: &StringRecord) -> bool {
+    record.iter().all(|r| r.is_empty())
+}
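The core of the new transform_csv is the date/time normalisation: "Date" cells are parsed with the `%d-%b-%y` format, the "Start"/"End"/"Actual Start"/"Actual End" cells with `%H:%M`, and each time is re-anchored onto the row's date. The standalone sketch below shows that behaviour with chrono alone; the sample values `01-Aug-24` and `09:30` are illustrative and not taken from the gem.

```rust
// Sketch of the normalisation transform_csv applies (sample values are made up).
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};

fn main() {
    // A "Date" cell such as "01-Aug-24" becomes a midnight datetime.
    let date: NaiveDateTime = NaiveDate::parse_from_str("01-Aug-24", "%d-%b-%y")
        .ok()
        .and_then(|d| d.and_hms_opt(0, 0, 0))
        .expect("date should match %d-%b-%y");

    // A time-only cell such as "09:30" is parsed on its own ...
    let time: NaiveTime = NaiveTime::parse_from_str("09:30", "%H:%M")
        .expect("time should match %H:%M");

    // ... and combined with the row's date, as transform_time_to_datetime does.
    let combined = NaiveDateTime::new(date.date(), time);
    println!("{combined}"); // 2024-08-01 09:30:00
}
```

Empty time cells are passed through unchanged, so only populated Start/End columns are rewritten.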
data/ext/csv_utils/src/utils/dedup.rs
CHANGED
@@ -28,7 +28,7 @@ pub fn dedup(ruby: &Ruby, previous_csv_path: String, new_csv_path: String, targe
         return Err(magnus::Error::new(ruby.exception_standard_error(), "headers of both csv files must be the same".to_string()));
     }
 
-    wtr.write_byte_record(previous_headers.as_byte_record()).
+    wtr.write_byte_record(previous_headers.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
 
     let mut previous_records = vec![];
     for previous_record in previous_csv.records() {
@@ -49,27 +49,21 @@ pub fn dedup(ruby: &Ruby, previous_csv_path: String, new_csv_path: String, targe
 
         let new_record = new_record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
         if !previous_records.contains(&new_record) {
-            wtr.write_byte_record(new_record.as_byte_record()).
+            wtr.write_byte_record(new_record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
         }
     }
 
-    wtr.flush().
+    wtr.flush().map_err(|e| magnus_err(ruby, e, "flush"))?;
 
     Ok(())
 }
 
 fn has_empty_first_col_skip_row(previous_record: &StringRecord) -> bool {
-
-        return true;
-    }
-    false
+    previous_record[0].is_empty()
 }
 
 fn has_empty_row_skip(record: &StringRecord) -> bool {
-
-        return true;
-    }
-    false
+    record.iter().all(|r| r.is_empty())
 }
 
 
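The dedup row predicates above now reduce to plain emptiness checks on the trimmed record. A minimal self-contained illustration of how they classify rows (the sample rows are invented for this sketch):

```rust
use csv::StringRecord;

// Same predicates as in dedup.rs: skip fully empty rows and rows whose
// first column is blank.
fn has_empty_first_col_skip_row(record: &StringRecord) -> bool {
    record[0].is_empty()
}

fn has_empty_row_skip(record: &StringRecord) -> bool {
    record.iter().all(|r| r.is_empty())
}

fn main() {
    let blank = StringRecord::from(vec!["", "", ""]);
    let no_first_col = StringRecord::from(vec!["", "2024-08-01", "09:00"]);
    let full = StringRecord::from(vec!["REQ-1", "2024-08-01", "09:00"]);

    assert!(has_empty_row_skip(&blank));
    assert!(has_empty_first_col_skip_row(&no_first_col));
    assert!(!has_empty_row_skip(&full) && !has_empty_first_col_skip_row(&full));
}
```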
data/ext/csv_utils/src/utils/mod.rs
CHANGED
@@ -3,9 +3,14 @@ use std::ffi::OsStr;
 use std::path::Path;
 use magnus::Ruby;
 
+pub mod csv;
 pub mod dedup;
 pub mod xls;
 
+fn missing_header(ruby: &Ruby, header: &str) -> magnus::Error {
+    magnus::Error::new(ruby.exception_standard_error(), format!("Missing '{}' header", header))
+}
+
 fn magnus_err<E: Error>(ruby: &Ruby, e: E, msg: &str) -> magnus::Error {
     magnus::Error::new(ruby.exception_standard_error(), format!("{}: {}", msg, e.to_string()))
 }
data/ext/csv_utils/src/utils/xls.rs
CHANGED
@@ -6,19 +6,15 @@ use calamine::{Data, open_workbook, Range, Reader, Xls};
 use chrono::{NaiveDateTime, Utc};
 use magnus::{RArray, Ruby};
 
-use crate::utils::{FileExtension, magnus_err};
+use crate::utils::{FileExtension, magnus_err, missing_header};
 
 pub fn to_csv(ruby: &Ruby, xls_path: String, target_path: String, exclusions: RArray) -> magnus::error::Result<()> {
-    let exclusions = RArray::to_vec(exclusions)?;
-
-    println!("xls_path: {:?}", xls_path);
-    println!("target_path: {:?}", target_path);
-    println!("exclusions: {:?}", exclusions);
-
     if !xls_path.has_extension(&["xls"]) {
         return Err(magnus::Error::new(ruby.exception_standard_error(), "xls_path must be an xls file".to_string()));
     }
 
+    let exclusions = RArray::to_vec(exclusions)?;
+
     let mut workbook: Xls<_> = open_workbook(xls_path.clone()).map_err(|e| magnus_err(ruby, e, format!("could not open xls: {}", xls_path).as_str()))?;
     let range = workbook.worksheet_range_at(0)
         .ok_or(magnus::Error::new(ruby.exception_standard_error(), "no worksheet found in xls".to_string()))
@@ -34,18 +30,23 @@ pub fn to_csv(ruby: &Ruby, xls_path: String, target_path: String, exclusions: RA
 
 fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_map: HashMap<String, usize>, exclusions: Vec<String>) -> magnus::error::Result<()> {
     let n = range.get_size().1 - 1;
+
+    let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
+    let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
+    let start = header_map.get("Start").ok_or(missing_header(ruby, "Start"))?;
+    let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
+    let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
+    let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
+
     for (ri, r) in range.rows().enumerate() {
         let mut date_value = Utc::now().naive_utc();
 
-        if skip_excluded_rows(&
+        if skip_excluded_rows(&request_id, r, &exclusions) { continue; }
         if skip_empty_rows(r) { continue; }
-        if skip_rows_with_no_request_id(&
-
-
-
-        let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
-        let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
-        let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
+        if skip_rows_with_no_request_id(&request_id, r) { continue; }
+        if date_value_is_not_present(&date, r) {
+            return Err(magnus::Error::new(ruby.exception_standard_error(), format!("Date value is not present in row: {}", ri)));
+        }
 
         for (i, c) in r.iter().enumerate() {
             match *c {
@@ -55,7 +56,7 @@ fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_ma
                 }
                 Data::Float(ref f) => write!(dest, "{}", f),
                 Data::DateTime(ref d) => {
-                    let mut current = d.as_datetime().
+                    let mut current = d.as_datetime().ok_or(to_datetime_error(ruby, &d.to_string(), ri, "Date"))?;
                     if i == *date {
                         date_value = current;
                     } else if i == *start || i == *end || i == *actual_start || i == *actual_end {
@@ -76,40 +77,31 @@ fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_ma
     Ok(())
 }
 
-fn
-
-    let value = r[*request_id].to_string();
-    if exclusions.contains(&value) {
-        return true;
-    }
-    }
-    false
+fn date_value_is_not_present(date: &usize, r: &[Data]) -> bool {
+    r[*date] == Data::Empty
 }
 
-fn
-
-
-    }
-    false
+fn skip_excluded_rows(request_id: &usize, r: &[Data], exclusions: &Vec<String>) -> bool {
+    let value = r[*request_id].to_string();
+    exclusions.contains(&value.to_string())
 }
 
-fn
-
-    if r[*request_id] == Data::Empty {
-        return true;
-    }
-    }
-    false
+fn skip_empty_rows(r: &[Data]) -> bool {
+    r.iter().all(|c| c == &Data::Empty)
 }
 
-fn
-
+fn skip_rows_with_no_request_id(request_id: &usize, r: &[Data]) -> bool {
+    r[*request_id] == Data::Empty
 }
 
 fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveDateTime) -> NaiveDateTime {
     NaiveDateTime::new(t1.date(), t2.time())
 }
 
+fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus::Error {
+    magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
+}
+
 fn handle_commas<W: Write>(dest: &mut W, s: &str) -> std::io::Result<()> {
     if s.contains(",") {
         write!(dest, "{:?}", clean_strings(s).trim_end())
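The write_csv refactor above resolves every required column index once, before the row loop, and fails fast via missing_header when a column is absent. A rough standalone sketch of that lookup pattern, using plain String errors in place of the gem's magnus errors:

```rust
use std::collections::HashMap;

// Stand-in for the gem's missing_header helper (which builds a magnus::Error).
fn missing_header(header: &str) -> String {
    format!("Missing '{}' header", header)
}

fn main() -> Result<(), String> {
    let headers = ["Request Id", "Date", "Start", "End", "Actual Start", "Actual End"];
    let header_map: HashMap<String, usize> = headers
        .iter()
        .enumerate()
        .map(|(i, h)| (h.to_string(), i))
        .collect();

    // Look up each required column once; a missing header aborts before any row work.
    let request_id = *header_map.get("Request Id").ok_or(missing_header("Request Id"))?;
    let date = *header_map.get("Date").ok_or(missing_header("Date"))?;
    println!("Request Id column: {request_id}, Date column: {date}");
    Ok(())
}
```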
data/lib/csv_utils/2.7/csv_utils.bundle
CHANGED
Binary file
data/lib/csv_utils/3.0/csv_utils.bundle
CHANGED
Binary file
data/lib/csv_utils/3.1/csv_utils.bundle
CHANGED
Binary file
data/lib/csv_utils/3.2/csv_utils.bundle
CHANGED
Binary file
data/lib/csv_utils/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: patchwork_csv_utils
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.8
 platform: x86_64-darwin
 authors:
 - kingsley.hendrickse
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-08-
+date: 2024-08-07 00:00:00.000000000 Z
 dependencies: []
 description: Deduplication of CSV files and XLS to CSV conversion.
 email:
@@ -28,6 +28,7 @@ files:
 - ext/csv_utils/Cargo.toml
 - ext/csv_utils/extconf.rb
 - ext/csv_utils/src/lib.rs
+- ext/csv_utils/src/utils/csv.rs
 - ext/csv_utils/src/utils/dedup.rs
 - ext/csv_utils/src/utils/mod.rs
 - ext/csv_utils/src/utils/xls.rs