patchwork_csv_utils 0.1.10-x86_64-darwin → 0.1.12-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +3 -3
- data/ext/csv_utils/src/lib.rs +3 -3
- data/ext/csv_utils/src/utils/csv.rs +73 -22
- data/ext/csv_utils/src/utils/dedup.rs +57 -32
- data/ext/csv_utils/src/utils/mod.rs +46 -1
- data/ext/csv_utils/src/utils/xls.rs +70 -19
- data/lib/csv_utils/2.7/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.0/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.1/csv_utils.bundle +0 -0
- data/lib/csv_utils/3.2/csv_utils.bundle +0 -0
- data/lib/csv_utils/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6641485d47a1ac2560273f63ee4666a84648b03c2fd493fdbf822f71b95b5058
|
4
|
+
data.tar.gz: 1c0d98407df7fbd7b1d50a16457cc672a6542bf854655ac402b546ff58fed05c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0e184033fa2e6ed8418ccafd4f7747aa8111e490167031295efbc147f5d1a12f991ff3934af269e65872c3f1d2ef2140c11160ce8fb6143b52e4f790c4ad5cf
|
7
|
+
data.tar.gz: d767afc020114a7ae899ac63e775f20e3bdc4547aecaae86199d6ca87d08c982a32397447d2340a6fd4aa3872f77624f46219803eaa186ad2cd37a286992b194
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -13,9 +13,9 @@ gem install patchwork_csv_utils
|
|
13
13
|
|
14
14
|
```irb
|
15
15
|
require 'csv_utils'
|
16
|
-
CsvUtils.dedup('file1.csv', 'file2.csv', 'output.csv')
|
17
|
-
CsvUtils.to_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip']])
|
18
|
-
CsvUtils.transform_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip']])
|
16
|
+
CsvUtils.dedup('file1.csv', 'file2.csv', 'output.csv', ['mandatory_headers'])
|
17
|
+
CsvUtils.to_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'], ['mandatory_headers'])
|
18
|
+
CsvUtils.transform_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'], ['mandatory_headers'])
|
19
19
|
```
|
20
20
|
|
21
21
|
## Release
|
data/ext/csv_utils/src/lib.rs
CHANGED
@@ -8,8 +8,8 @@ pub mod utils;
|
|
8
8
|
#[magnus::init]
|
9
9
|
fn init() -> Result<(), magnus::Error> {
|
10
10
|
let module = define_module("CsvUtils")?;
|
11
|
-
module.define_singleton_method("dedup", function!(dedup,
|
12
|
-
module.define_singleton_method("to_csv", function!(to_csv,
|
13
|
-
module.define_singleton_method("transform_csv", function!(transform_csv,
|
11
|
+
module.define_singleton_method("dedup", function!(dedup, 4))?;
|
12
|
+
module.define_singleton_method("to_csv", function!(to_csv, 4))?;
|
13
|
+
module.define_singleton_method("transform_csv", function!(transform_csv, 4))?;
|
14
14
|
Ok(())
|
15
15
|
}
|
@@ -2,26 +2,35 @@ use std::collections::HashMap;
|
|
2
2
|
use std::fs::File;
|
3
3
|
|
4
4
|
use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
|
5
|
-
use csv::{StringRecord, Writer};
|
5
|
+
use csv::{Reader, StringRecord, Writer};
|
6
6
|
use magnus::{Error, RArray, Ruby};
|
7
7
|
|
8
|
-
use crate::utils::{FileExtension, magnus_err, missing_header, to_datetime_error};
|
8
|
+
use crate::utils::{FileExtension, magnus_err, missing_header, to_datetime_error, check_mandatory_headers, create_header_map, missing_value, headers_as_byte_record, index_of_header_in_mandatory_list};
|
9
9
|
|
10
|
-
pub fn transform_csv(ruby: &Ruby, csv_path: String,
|
10
|
+
pub fn transform_csv(ruby: &Ruby, csv_path: String,
|
11
|
+
target_path: String, exclusions: RArray,
|
12
|
+
mandatory_headers: RArray, ) -> magnus::error::Result<()> {
|
11
13
|
if !csv_path.has_extension(&["csv"]) {
|
12
|
-
return Err(
|
14
|
+
return Err(Error::new(ruby.exception_standard_error(), "csv_path must be a csv file".to_string()));
|
13
15
|
}
|
14
16
|
|
15
17
|
let exclusions = RArray::to_vec(exclusions)?;
|
18
|
+
let mandatory_headers: Vec<String> = RArray::to_vec(mandatory_headers)?;
|
16
19
|
|
17
20
|
let csv_file = File::open(csv_path).map_err(|e| magnus_err(ruby, e, "csv_path"))?;
|
18
|
-
let mut csv:
|
21
|
+
let mut csv: Reader<File> = Reader::from_reader(csv_file);
|
19
22
|
let mut wtr = Writer::from_path(target_path).map_err(|e| magnus_err(ruby, e, "target_path"))?;
|
20
|
-
let headers = csv.headers().map_err(|e| magnus_err(ruby, e, "csv_path headers"))
|
21
|
-
let
|
22
|
-
let inverse_header_map: HashMap<usize, String> = headers.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
|
23
|
+
let headers = csv.headers().map_err(|e| magnus_err(ruby, e, "csv_path headers"))?.clone();
|
24
|
+
let headers_list: Vec<String> = headers.iter().map(|h| h.to_string()).collect();
|
23
25
|
|
24
|
-
|
26
|
+
if let Some(value) =
|
27
|
+
check_mandatory_headers(ruby, &headers_list, &mandatory_headers, "csv") { return value; }
|
28
|
+
|
29
|
+
let header_map: HashMap<String, usize> = create_header_map(&mandatory_headers);
|
30
|
+
let inverse_header_map: HashMap<usize, String> = mandatory_headers.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
|
31
|
+
|
32
|
+
let csv_headers = headers_as_byte_record(mandatory_headers.clone());
|
33
|
+
wtr.write_byte_record(&csv_headers).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
25
34
|
|
26
35
|
let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
|
27
36
|
let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
|
@@ -30,8 +39,9 @@ pub fn transform_csv(ruby: &Ruby, csv_path: String, target_path: String, exclusi
|
|
30
39
|
let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
|
31
40
|
let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
|
32
41
|
|
33
|
-
|
34
|
-
|
42
|
+
let mandatory_records = get_mandatory_records(&ruby, &mut csv, &headers_list, &mandatory_headers)?;
|
43
|
+
|
44
|
+
for (ri, record) in mandatory_records.iter().enumerate() {
|
35
45
|
|
36
46
|
if skip_excluded_rows(request_id, &record, &exclusions) { continue; }
|
37
47
|
if has_empty_row_skip(&record) { continue; }
|
@@ -39,22 +49,29 @@ pub fn transform_csv(ruby: &Ruby, csv_path: String, target_path: String, exclusi
|
|
39
49
|
|
40
50
|
let mut date_value = Utc::now().naive_utc();
|
41
51
|
|
42
|
-
let
|
43
|
-
|
52
|
+
let mut columns = vec![];
|
53
|
+
for (i, column) in mandatory_headers.iter().enumerate() {
|
54
|
+
let column_index = header_map.get(column).ok_or(missing_header(ruby, column))?;
|
55
|
+
let column_value = record.get(*column_index).ok_or(missing_value(ruby, column))?;
|
56
|
+
let column_value = column_value.trim_end();
|
57
|
+
|
44
58
|
if i == *date {
|
45
|
-
let current = string_to_datetime(
|
59
|
+
let current = string_to_datetime(column_value).ok_or(to_datetime_error(ruby, column_value, ri, "Date"))?;
|
46
60
|
date_value = current;
|
47
|
-
|
61
|
+
columns.push(current.to_string());
|
48
62
|
} else if i == *start || i == *end || i == *actual_start || i == *actual_end {
|
49
|
-
if
|
50
|
-
|
51
|
-
|
63
|
+
if column_value.is_empty() {
|
64
|
+
columns.push(column_value.to_string());
|
65
|
+
} else {
|
66
|
+
let column_name = get_column_name(&inverse_header_map, &i);
|
67
|
+
let current = process_datetime(ruby, ri, date_value, column_value, &column_name)?;
|
68
|
+
columns.push(current);
|
69
|
+
}
|
52
70
|
} else {
|
53
|
-
|
71
|
+
columns.push(column_value.to_string());
|
54
72
|
}
|
55
|
-
}
|
56
|
-
|
57
|
-
let record = record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
|
73
|
+
}
|
74
|
+
let record = columns.into_iter().collect::<StringRecord>();
|
58
75
|
wtr.write_byte_record(record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
59
76
|
}
|
60
77
|
|
@@ -63,6 +80,28 @@ pub fn transform_csv(ruby: &Ruby, csv_path: String, target_path: String, exclusi
|
|
63
80
|
Ok(())
|
64
81
|
}
|
65
82
|
|
83
|
+
fn get_mandatory_records(ruby: &Ruby, csv: &mut Reader<File>, csv_header_list: &Vec<String>, mandatory_headers_list: &Vec<String>) -> magnus::error::Result<Vec<StringRecord>> {
|
84
|
+
let inverse_header_map: HashMap<usize, String> = csv_header_list.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
|
85
|
+
|
86
|
+
let mut records = vec![];
|
87
|
+
for row in csv.records() {
|
88
|
+
let row = row.map_err(|e| magnus_err(ruby, e, "record error"))?;
|
89
|
+
let mut columns = vec![];
|
90
|
+
for (i, column_value) in row.iter().enumerate() {
|
91
|
+
let column_name = inverse_header_map.get(&i).ok_or(missing_header(ruby, &i.to_string()))?;
|
92
|
+
if mandatory_headers_list.contains(column_name) {
|
93
|
+
let index = index_of_header_in_mandatory_list(mandatory_headers_list.clone(), column_name.to_string()).unwrap();
|
94
|
+
columns.push(CsvMandatoryColumn::new(column_value.to_string(), index));
|
95
|
+
}
|
96
|
+
}
|
97
|
+
columns.sort_by(|a, b| a.index.cmp(&b.index));
|
98
|
+
let columns = columns.iter().map(|c| c.value.to_string()).collect::<StringRecord>();
|
99
|
+
records.push(columns);
|
100
|
+
}
|
101
|
+
|
102
|
+
Ok(records)
|
103
|
+
}
|
104
|
+
|
66
105
|
fn process_datetime(ruby: &Ruby, ri: usize, date_value: NaiveDateTime, c: &str, column_name: &String) -> magnus::error::Result<String> {
|
67
106
|
let maybe_correct = correct_datetime(c);
|
68
107
|
if let Some(correct) = maybe_correct {
|
@@ -110,4 +149,16 @@ fn has_empty_first_col_skip_row(record: &StringRecord) -> bool {
|
|
110
149
|
|
111
150
|
fn has_empty_row_skip(record: &StringRecord) -> bool {
|
112
151
|
record.iter().all(|r| r.is_empty())
|
152
|
+
}
|
153
|
+
|
154
|
+
#[derive(Debug, PartialOrd, PartialEq, Eq, Ord)]
|
155
|
+
struct CsvMandatoryColumn {
|
156
|
+
value: String,
|
157
|
+
index: usize,
|
158
|
+
}
|
159
|
+
|
160
|
+
impl CsvMandatoryColumn {
|
161
|
+
fn new(value: String, index: usize) -> Self {
|
162
|
+
CsvMandatoryColumn { value, index }
|
163
|
+
}
|
113
164
|
}
|
@@ -1,54 +1,51 @@
|
|
1
1
|
use std::fs::File;
|
2
2
|
|
3
|
-
use csv::{StringRecord, Writer};
|
4
|
-
use magnus::Ruby;
|
3
|
+
use csv::{Reader, StringRecord, Writer};
|
4
|
+
use magnus::{RArray, Ruby};
|
5
5
|
|
6
|
-
use crate::utils::{FileExtension, magnus_err};
|
6
|
+
use crate::utils::{FileExtension, magnus_err, check_mandatory_headers, create_header_map, missing_header, missing_value, headers_as_byte_record};
|
7
7
|
|
8
|
-
pub fn dedup(ruby: &Ruby, previous_csv_path: String,
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
8
|
+
pub fn dedup(ruby: &Ruby, previous_csv_path: String,
|
9
|
+
new_csv_path: String,
|
10
|
+
target_path: String,
|
11
|
+
mandatory_headers: RArray,
|
12
|
+
) -> magnus::error::Result<()> {
|
13
|
+
if let Some(value) =
|
14
|
+
check_file_extension(ruby, &previous_csv_path, "previous_csv_path") { return value; }
|
15
|
+
|
16
|
+
if let Some(value) =
|
17
|
+
check_file_extension(ruby, &new_csv_path, "new_csv_path") { return value; }
|
15
18
|
|
16
19
|
let csv1 = File::open(previous_csv_path).map_err(|e| magnus_err(ruby, e, "previous_csv_path"))?;
|
17
20
|
let csv2 = File::open(new_csv_path).map_err(|e| magnus_err(ruby, e, "new_csv_path"))?;
|
18
21
|
|
22
|
+
let mandatory_headers: Vec<String> = RArray::to_vec(mandatory_headers)?;
|
23
|
+
|
19
24
|
let mut previous_csv: csv::Reader<File> = csv::Reader::from_reader(csv1);
|
20
25
|
let mut new_csv: csv::Reader<File> = csv::Reader::from_reader(csv2);
|
21
26
|
|
22
27
|
let mut wtr = Writer::from_path(target_path).map_err(|e| magnus_err(ruby, e, "target_path"))?;
|
23
28
|
|
24
|
-
let previous_headers = previous_csv.headers().map_err(|e| magnus_err(ruby, e, "
|
25
|
-
let
|
29
|
+
let previous_headers = previous_csv.headers().map_err(|e| magnus_err(ruby, e, "previous_csv"))?.clone();
|
30
|
+
let previous_headers_list : Vec<String> = previous_headers.iter().map(|h| h.to_string()).collect();
|
31
|
+
let new_headers = new_csv.headers().map_err(|e| magnus_err(ruby, e, "new_csv"))?.clone();
|
32
|
+
let new_headers_list : Vec<String> = new_headers.iter().map(|h| h.to_string()).collect();
|
26
33
|
|
27
|
-
if previous_headers != new_headers {
|
28
|
-
return Err(magnus::Error::new(ruby.exception_standard_error(), "headers of both csv files must be the same".to_string()));
|
29
|
-
}
|
30
34
|
|
31
|
-
|
35
|
+
if let Some(value) =
|
36
|
+
check_mandatory_headers(ruby, &previous_headers_list, &mandatory_headers, "previous_csv") { return value; }
|
32
37
|
|
33
|
-
let
|
34
|
-
|
35
|
-
let previous_record = previous_record.map_err(|e| magnus_err(ruby, e, "previous_record"))?;
|
38
|
+
if let Some(value) =
|
39
|
+
check_mandatory_headers(ruby, &new_headers_list, &mandatory_headers, "new_csv") { return value; }
|
36
40
|
|
37
|
-
|
38
|
-
|
41
|
+
let csv_headers = headers_as_byte_record(mandatory_headers.clone());
|
42
|
+
wtr.write_byte_record(&csv_headers).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
39
43
|
|
40
|
-
|
41
|
-
|
42
|
-
}
|
43
|
-
|
44
|
-
for new_record in new_csv.records() {
|
45
|
-
let new_record = new_record.map_err(|e| magnus_err(ruby, e, "new_record"))?;
|
44
|
+
let previous_mandatory_records = get_records(ruby, &mut previous_csv, previous_headers_list, &mandatory_headers)?;
|
45
|
+
let new_mandatory_records = get_records(ruby, &mut new_csv, new_headers_list, &mandatory_headers)?;
|
46
46
|
|
47
|
-
|
48
|
-
if
|
49
|
-
|
50
|
-
let new_record = new_record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
|
51
|
-
if !previous_records.contains(&new_record) {
|
47
|
+
for new_record in new_mandatory_records {
|
48
|
+
if !previous_mandatory_records.contains(&new_record) {
|
52
49
|
wtr.write_byte_record(new_record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
|
53
50
|
}
|
54
51
|
}
|
@@ -58,6 +55,34 @@ pub fn dedup(ruby: &Ruby, previous_csv_path: String, new_csv_path: String, targe
|
|
58
55
|
Ok(())
|
59
56
|
}
|
60
57
|
|
58
|
+
fn get_records(ruby: &Ruby, csv: &mut Reader<File>, csv_headers: Vec<String>, headers: &Vec<String>) -> magnus::error::Result<Vec<StringRecord>> {
|
59
|
+
let header_map = create_header_map(&csv_headers);
|
60
|
+
let mut records = vec![];
|
61
|
+
for record in csv.records() {
|
62
|
+
let record = record.map_err(|e| magnus_err(ruby, e, "record error"))?;
|
63
|
+
|
64
|
+
if has_empty_row_skip(&record) { continue; }
|
65
|
+
if has_empty_first_col_skip_row(&record) { continue; }
|
66
|
+
|
67
|
+
let mut columns = vec![];
|
68
|
+
for column in headers.iter() {
|
69
|
+
let column_index = header_map.get(column).ok_or(missing_header(ruby, column))?;
|
70
|
+
let column_value = record.get(*column_index).ok_or(missing_value(ruby, column))?;
|
71
|
+
columns.push(column_value.trim_end());
|
72
|
+
}
|
73
|
+
let columns = columns.into_iter().collect::<StringRecord>();
|
74
|
+
records.push(columns);
|
75
|
+
}
|
76
|
+
Ok(records)
|
77
|
+
}
|
78
|
+
|
79
|
+
fn check_file_extension(ruby: &Ruby, csv_path: &String, message: &str) -> Option<magnus::error::Result<()>> {
|
80
|
+
if !csv_path.has_extension(&["csv"]) {
|
81
|
+
return Some(Err(magnus::Error::new(ruby.exception_standard_error(), format!("{} must be a csv file", message))));
|
82
|
+
}
|
83
|
+
None
|
84
|
+
}
|
85
|
+
|
61
86
|
fn has_empty_first_col_skip_row(previous_record: &StringRecord) -> bool {
|
62
87
|
previous_record[0].is_empty()
|
63
88
|
}
|
@@ -1,6 +1,8 @@
|
|
1
|
+
use std::collections::{HashMap, HashSet};
|
1
2
|
use std::error::Error;
|
2
3
|
use std::ffi::OsStr;
|
3
4
|
use std::path::Path;
|
5
|
+
use ::csv::{ByteRecord, StringRecord};
|
4
6
|
use magnus::Ruby;
|
5
7
|
|
6
8
|
pub mod csv;
|
@@ -11,6 +13,15 @@ fn missing_header(ruby: &Ruby, header: &str) -> magnus::Error {
|
|
11
13
|
magnus::Error::new(ruby.exception_standard_error(), format!("Missing '{}' header", header))
|
12
14
|
}
|
13
15
|
|
16
|
+
fn missing_value(ruby: &Ruby, header: &str) -> magnus::Error {
|
17
|
+
magnus::Error::new(ruby.exception_standard_error(), format!("Missing value for '{}' header", header))
|
18
|
+
}
|
19
|
+
|
20
|
+
fn headers_as_byte_record(headers: Vec<String>) -> ByteRecord {
|
21
|
+
let string_record = headers.into_iter().collect::<StringRecord>();
|
22
|
+
string_record.as_byte_record().clone()
|
23
|
+
}
|
24
|
+
|
14
25
|
fn magnus_err<E: Error>(ruby: &Ruby, e: E, msg: &str) -> magnus::Error {
|
15
26
|
magnus::Error::new(ruby.exception_standard_error(), format!("{}: {}", msg, e.to_string()))
|
16
27
|
}
|
@@ -19,6 +30,39 @@ fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus:
|
|
19
30
|
magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
|
20
31
|
}
|
21
32
|
|
33
|
+
fn check_mandatory_headers(ruby: &Ruby, headers: &Vec<String>, mandatory_headers: &Vec<String>, message: &str) -> Option<magnus::error::Result<()>> {
|
34
|
+
let csv_mandatory_headers = filter_headers(headers, mandatory_headers);
|
35
|
+
|
36
|
+
if csv_mandatory_headers.is_empty() {
|
37
|
+
return Some(Err(magnus::Error::new(ruby.exception_standard_error(), format!("{} has no mandatory headers", message))));
|
38
|
+
}
|
39
|
+
|
40
|
+
let csv_mandatory_headers = csv_mandatory_headers.to_owned().clone();
|
41
|
+
let mandatory_headers = mandatory_headers.to_owned().clone();
|
42
|
+
|
43
|
+
let set1 = csv_mandatory_headers.iter().collect::<HashSet<_>>();
|
44
|
+
let set2 = mandatory_headers.iter().collect::<HashSet<_>>();
|
45
|
+
let difference = set2.difference(&set1).collect::<Vec<_>>();
|
46
|
+
|
47
|
+
if !difference.is_empty() {
|
48
|
+
let missing_headers = difference.iter().map(|h| h.to_string()).collect::<Vec<String>>();
|
49
|
+
return Some(Err(magnus::Error::new(ruby.exception_standard_error(), format!("{} is missing mandatory headers: {}", message, missing_headers.join(", ")))));
|
50
|
+
}
|
51
|
+
None
|
52
|
+
}
|
53
|
+
|
54
|
+
fn index_of_header_in_mandatory_list(mandatory_headers_list: Vec<String>, column_name: String) -> Option<usize> {
|
55
|
+
mandatory_headers_list.iter().position(|h| h.to_string() == column_name)
|
56
|
+
}
|
57
|
+
|
58
|
+
fn filter_headers(csv_headers: &Vec<String>, expected_headers: &Vec<String>) -> Vec<String> {
|
59
|
+
csv_headers.iter().map(|v| v.to_string()).filter(|h| expected_headers.contains(h)).collect::<Vec<String>>()
|
60
|
+
}
|
61
|
+
|
62
|
+
fn create_header_map(headers: &Vec<String>) -> HashMap<String, usize> {
|
63
|
+
headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect()
|
64
|
+
}
|
65
|
+
|
22
66
|
pub trait FileExtension {
|
23
67
|
fn has_extension<S: AsRef<str>>(&self, extensions: &[S]) -> bool;
|
24
68
|
}
|
@@ -33,4 +77,5 @@ impl<P: AsRef<Path>> FileExtension for P {
|
|
33
77
|
|
34
78
|
false
|
35
79
|
}
|
36
|
-
}
|
80
|
+
}
|
81
|
+
|
@@ -6,14 +6,19 @@ use calamine::{Data, open_workbook, Range, Reader, Xls};
|
|
6
6
|
use chrono::{NaiveDateTime, Utc};
|
7
7
|
use magnus::{RArray, Ruby};
|
8
8
|
|
9
|
-
use crate::utils::{FileExtension, magnus_err, missing_header, to_datetime_error};
|
9
|
+
use crate::utils::{FileExtension, magnus_err, missing_header, to_datetime_error, check_mandatory_headers, missing_value, index_of_header_in_mandatory_list};
|
10
10
|
|
11
|
-
pub fn to_csv(ruby: &Ruby, xls_path: String,
|
11
|
+
pub fn to_csv(ruby: &Ruby, xls_path: String,
|
12
|
+
target_path: String,
|
13
|
+
exclusions: RArray,
|
14
|
+
mandatory_headers: RArray
|
15
|
+
) -> magnus::error::Result<()> {
|
12
16
|
if !xls_path.has_extension(&["xls"]) {
|
13
17
|
return Err(magnus::Error::new(ruby.exception_standard_error(), "xls_path must be an xls file".to_string()));
|
14
18
|
}
|
15
19
|
|
16
20
|
let exclusions = RArray::to_vec(exclusions)?;
|
21
|
+
let mandatory_headers: Vec<String> = RArray::to_vec(mandatory_headers)?;
|
17
22
|
|
18
23
|
let mut workbook: Xls<_> = open_workbook(xls_path.clone()).map_err(|e| magnus_err(ruby, e, format!("could not open xls: {}", xls_path).as_str()))?;
|
19
24
|
let range = workbook.worksheet_range_at(0)
|
@@ -21,14 +26,22 @@ pub fn to_csv(ruby: &Ruby, xls_path: String, target_path: String, exclusions: RA
|
|
21
26
|
.and_then(|r| r.map_err(|e| magnus_err(ruby, e, "could not read worksheet range")))?;
|
22
27
|
|
23
28
|
let headers = range.headers().ok_or(magnus::Error::new(ruby.exception_standard_error(), "no headers found in xls".to_string()))?;
|
24
|
-
let
|
29
|
+
let headers_list : Vec<String> = headers.iter().map(|h| h.to_string()).collect();
|
30
|
+
|
31
|
+
if let Some(value) =
|
32
|
+
check_mandatory_headers(ruby, &headers_list, &mandatory_headers, "csv") { return value; }
|
33
|
+
|
34
|
+
let header_map: HashMap<String, usize> = mandatory_headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect();
|
25
35
|
let csv_out_file = File::create(target_path.clone()).map_err(|e| magnus_err(ruby, e, format!("could not create csv file: {}", target_path).as_str()))?;
|
26
36
|
let mut dest = BufWriter::new(csv_out_file);
|
27
37
|
|
28
|
-
write_csv(ruby, &mut dest, &range, header_map, exclusions)
|
38
|
+
write_csv(ruby, &mut dest, &range, header_map, exclusions, mandatory_headers, headers_list)
|
29
39
|
}
|
30
40
|
|
31
|
-
fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>,
|
41
|
+
fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>,
|
42
|
+
header_map: HashMap<String, usize>, exclusions: Vec<String>,
|
43
|
+
mandatory_headers: Vec<String>,
|
44
|
+
headers_list: Vec<String>) -> magnus::error::Result<()> {
|
32
45
|
let n = range.get_size().1 - 1;
|
33
46
|
|
34
47
|
let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
|
@@ -38,17 +51,23 @@ fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_ma
|
|
38
51
|
let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
|
39
52
|
let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
|
40
53
|
|
41
|
-
|
54
|
+
let mandatory_rows = get_mandatory_records(ruby, range, &headers_list, &mandatory_headers)?;
|
55
|
+
|
56
|
+
for (ri, r) in mandatory_rows.into_iter().enumerate() {
|
42
57
|
let mut date_value = Utc::now().naive_utc();
|
43
58
|
|
44
|
-
if skip_excluded_rows(&request_id, r, &exclusions) { continue; }
|
45
|
-
if skip_empty_rows(r) { continue; }
|
46
|
-
if skip_rows_with_no_request_id(&request_id, r) { continue; }
|
47
|
-
if date_value_is_not_present(&date, r) {
|
59
|
+
if skip_excluded_rows(&request_id, &r, &exclusions) { continue; }
|
60
|
+
if skip_empty_rows(&r) { continue; }
|
61
|
+
if skip_rows_with_no_request_id(&request_id, &r) { continue; }
|
62
|
+
if date_value_is_not_present(&date, &r) {
|
48
63
|
return Err(magnus::Error::new(ruby.exception_standard_error(), format!("Date value is not present in row: {}", ri)));
|
49
64
|
}
|
50
65
|
|
51
|
-
for (i, c) in
|
66
|
+
for (i, c) in mandatory_headers.iter().enumerate() {
|
67
|
+
|
68
|
+
let column_index = header_map.get(c).ok_or(missing_header(ruby, c))?;
|
69
|
+
let c = r.get(*column_index).ok_or(missing_value(ruby, c))?;
|
70
|
+
|
52
71
|
match *c {
|
53
72
|
Data::Empty => Ok(()),
|
54
73
|
Data::String(ref s) | Data::DateTimeIso(ref s) | Data::DurationIso(ref s) => {
|
@@ -77,21 +96,42 @@ fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_ma
|
|
77
96
|
Ok(())
|
78
97
|
}
|
79
98
|
|
80
|
-
fn
|
81
|
-
|
99
|
+
fn get_mandatory_records<'a>(ruby: &Ruby, range: &'a Range<Data>, csv_header_list: &Vec<String>, mandatory_headers_list: &Vec<String>) -> magnus::error::Result<Vec<Vec<&'a Data>>> {
|
100
|
+
let inverse_header_map: HashMap<usize, String> = csv_header_list.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
|
101
|
+
|
102
|
+
let mut records = vec![];
|
103
|
+
for row in range.rows() {
|
104
|
+
let mut columns = vec![];
|
105
|
+
for (i, column_value) in row.iter().enumerate() {
|
106
|
+
let column_name = inverse_header_map.get(&i).ok_or(missing_header(ruby, &i.to_string()))?;
|
107
|
+
if mandatory_headers_list.contains(column_name) {
|
108
|
+
let index = index_of_header_in_mandatory_list(mandatory_headers_list.clone(), column_name.to_string()).unwrap();
|
109
|
+
columns.push(XlsMandatoryColumn::new(column_value, index));
|
110
|
+
}
|
111
|
+
}
|
112
|
+
columns.sort_by(|a, b| a.index.cmp(&b.index));
|
113
|
+
let columns = columns.iter().map(|c| c.value).collect::<Vec<&Data>>();
|
114
|
+
records.push(columns);
|
115
|
+
}
|
116
|
+
|
117
|
+
Ok(records)
|
82
118
|
}
|
83
119
|
|
84
|
-
fn
|
120
|
+
fn date_value_is_not_present(date: &usize, r: &Vec<&Data>) -> bool {
|
121
|
+
r[*date] == &Data::Empty
|
122
|
+
}
|
123
|
+
|
124
|
+
fn skip_excluded_rows(request_id: &usize, r: &Vec<&Data>, exclusions: &Vec<String>) -> bool {
|
85
125
|
let value = r[*request_id].to_string();
|
86
126
|
exclusions.contains(&value.to_string())
|
87
127
|
}
|
88
128
|
|
89
|
-
fn skip_empty_rows(r: &
|
90
|
-
r.
|
129
|
+
fn skip_empty_rows(r: &Vec<&Data>) -> bool {
|
130
|
+
r.into_iter().all(|c| c == &&Data::Empty)
|
91
131
|
}
|
92
132
|
|
93
|
-
fn skip_rows_with_no_request_id(request_id: &usize, r: &
|
94
|
-
r[*request_id] == Data::Empty
|
133
|
+
fn skip_rows_with_no_request_id(request_id: &usize, r: &Vec<&Data>) -> bool {
|
134
|
+
r[*request_id] == &Data::Empty
|
95
135
|
}
|
96
136
|
|
97
137
|
fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveDateTime) -> NaiveDateTime {
|
@@ -111,4 +151,15 @@ fn clean_strings(s: &str) -> String {
|
|
111
151
|
.replace("\r", " ")
|
112
152
|
.replace("\"", "")
|
113
153
|
.replace("'", "")
|
114
|
-
}
|
154
|
+
}
|
155
|
+
|
156
|
+
struct XlsMandatoryColumn<'a> {
|
157
|
+
value: &'a Data,
|
158
|
+
index: usize,
|
159
|
+
}
|
160
|
+
|
161
|
+
impl<'a> XlsMandatoryColumn<'a> {
|
162
|
+
fn new(value: &'a Data, index: usize) -> Self {
|
163
|
+
XlsMandatoryColumn { value, index }
|
164
|
+
}
|
165
|
+
}
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/csv_utils/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: patchwork_csv_utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.12
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- kingsley.hendrickse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-03 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Deduplication of CSV files and XLS to CSV conversion.
|
14
14
|
email:
|