patchwork_csv_utils 0.1.10-aarch64-linux
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +30 -0
- data/Cargo.lock +808 -0
- data/Cargo.toml +7 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +70 -0
- data/README.md +33 -0
- data/Rakefile +36 -0
- data/ext/csv_utils/Cargo.toml +15 -0
- data/ext/csv_utils/extconf.rb +6 -0
- data/ext/csv_utils/src/lib.rs +15 -0
- data/ext/csv_utils/src/utils/csv.rs +113 -0
- data/ext/csv_utils/src/utils/dedup.rs +69 -0
- data/ext/csv_utils/src/utils/mod.rs +36 -0
- data/ext/csv_utils/src/utils/xls.rs +114 -0
- data/lib/csv_utils/2.7/csv_utils.so +0 -0
- data/lib/csv_utils/3.0/csv_utils.so +0 -0
- data/lib/csv_utils/3.1/csv_utils.so +0 -0
- data/lib/csv_utils/3.2/csv_utils.so +0 -0
- data/lib/csv_utils/version.rb +5 -0
- data/lib/csv_utils.rb +14 -0
- metadata +69 -0
data/Cargo.toml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in csv_utils.gemspec
+gemspec
+
+group :development do
+  gem 'rake', '~> 13.0'
+  gem 'rake-compiler'
+  gem 'rb_sys', '~> 0.9.98'
+  gem 'rspec', '~> 3.0'
+  gem 'rubocop', '~> 1.21'
+end
data/Gemfile.lock
ADDED
@@ -0,0 +1,70 @@
+PATH
+  remote: .
+  specs:
+    patchwork_csv_utils (0.1.10)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ast (2.4.2)
+    diff-lcs (1.5.1)
+    json (2.7.2)
+    language_server-protocol (3.17.0.3)
+    parallel (1.25.1)
+    parser (3.3.4.0)
+      ast (~> 2.4.1)
+      racc
+    racc (1.8.0)
+    rainbow (3.1.1)
+    rake (13.2.1)
+    rake-compiler (1.2.7)
+      rake
+    rb_sys (0.9.98)
+    regexp_parser (2.9.2)
+    rexml (3.3.2)
+      strscan
+    rspec (3.13.0)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.0)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.1)
+    rubocop (1.65.0)
+      json (~> 2.3)
+      language_server-protocol (>= 3.17.0)
+      parallel (~> 1.10)
+      parser (>= 3.3.0.2)
+      rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 2.4, < 3.0)
+      rexml (>= 3.2.5, < 4.0)
+      rubocop-ast (>= 1.31.1, < 2.0)
+      ruby-progressbar (~> 1.7)
+      unicode-display_width (>= 2.4.0, < 3.0)
+    rubocop-ast (1.31.3)
+      parser (>= 3.3.1.0)
+    ruby-progressbar (1.13.0)
+    strscan (3.1.0)
+    unicode-display_width (2.5.0)
+
+PLATFORMS
+  arm64-darwin-22
+  arm64-darwin-23
+  x86_64-linux
+
+DEPENDENCIES
+  patchwork_csv_utils!
+  rake (~> 13.0)
+  rake-compiler
+  rb_sys (~> 0.9.98)
+  rspec (~> 3.0)
+  rubocop (~> 1.21)
+
+BUNDLED WITH
+   2.4.10
data/README.md
ADDED
@@ -0,0 +1,33 @@
+# CsvUtils
+
+* dedup: Given 2 CSV files, this gem will create a third CSV file that contains rows from the first CSV file that are not present in the second CSV file.
+* xls to csv: Given an XLS file, this gem will create a CSV file with the specified name.
+
+## Installation
+
+```bash
+gem install patchwork_csv_utils
+```
+
+## Usage
+
+```irb
+require 'csv_utils'
+CsvUtils.dedup('file1.csv', 'file2.csv', 'output.csv')
+CsvUtils.to_csv('file1.xls', 'output_file1.csv', ['request_ids_to_skip'])
+CsvUtils.transform_csv('file1.csv', 'output_file1.csv', ['request_ids_to_skip'])
+```
+
+## Release
+
+* to release a new version, update the version number in `lib/patchwork_csv_utils/version.rb`
+* push the changes to github and then create a tag with the version number
+
+```bash
+git tag -a v0.1.0 -m "v0.1.0"
+git push origin --tags
+```
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at http://github.com/patchworkhealth/csv_utils.
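
For context, the calls shown in the README's Usage section raise StandardError from the native extension on bad input (wrong file extension, missing headers, unparseable datetimes). A minimal wrapper sketch, assuming hypothetical paths and an exclusion list that are not part of the gem:

```ruby
# Hypothetical wrapper; paths and request ids are placeholders, not part of the gem.
require 'csv_utils'

def convert_and_dedup(xls_path, previous_csv, output_dir, excluded_request_ids)
  converted = File.join(output_dir, 'converted.csv')
  deduped   = File.join(output_dir, 'deduped.csv')

  # XLS -> CSV, skipping rows whose "Request Id" is in the exclusion list.
  CsvUtils.to_csv(xls_path, converted, excluded_request_ids)

  # Drop rows that already appeared in the previous export (see dedup.rs below).
  CsvUtils.dedup(previous_csv, converted, deduped)
  deduped
rescue StandardError => e
  # The native layer reports failures as StandardError with a descriptive message.
  warn "csv_utils failed: #{e.message}"
  nil
end
```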
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+
+RSpec::Core::RakeTask.new(:spec)
+
+require 'rubocop/rake_task'
+
+RuboCop::RakeTask.new
+
+require 'rb_sys/extensiontask'
+
+task build: :compile
+
+spec = Bundler.load_gemspec('patchwork_csv_utils.gemspec')
+
+Rake::ExtensionTask.new('csv_utils', spec) do |c|
+  c.lib_dir = 'lib/csv_utils'
+  c.cross_compile = true
+  c.cross_platform = %w[
+    aarch64-linux
+    arm64-darwin
+    x64-mingw-ucrt
+    x64-mingw32
+    x86_64-darwin
+    x86_64-linux
+    x86_64-linux-musl
+  ]
+end
+
+RbSys::ExtensionTask.new('csv_utils') do |ext|
+  ext.lib_dir = 'lib/csv_utils'
+end
+
+task default: %i[compile spec rubocop]
data/ext/csv_utils/Cargo.toml
ADDED
@@ -0,0 +1,15 @@
+[package]
+name = "csv_utils"
+version = "0.1.0"
+edition = "2021"
+authors = ["kingsley.hendrickse <kingsley.hendrickse@patchwork.health>"]
+publish = false
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+magnus = { version = "0.7.1" }
+csv = "1.3.0"
+calamine = { version = "0.25.0", features = ["dates"] }
+chrono = "0.4.38"
data/ext/csv_utils/src/lib.rs
ADDED
@@ -0,0 +1,15 @@
+use magnus::{define_module, function, prelude::*};
+use crate::utils::csv::transform_csv;
+use crate::utils::dedup::dedup;
+use crate::utils::xls::to_csv;
+
+pub mod utils;
+
+#[magnus::init]
+fn init() -> Result<(), magnus::Error> {
+    let module = define_module("CsvUtils")?;
+    module.define_singleton_method("dedup", function!(dedup, 3))?;
+    module.define_singleton_method("to_csv", function!(to_csv, 3))?;
+    module.define_singleton_method("transform_csv", function!(transform_csv, 3))?;
+    Ok(())
+}
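
The init hook above registers CsvUtils as a Ruby module with three singleton methods, each taking three arguments. A rough sketch of the resulting Ruby surface; the argument names are taken from the Rust function signatures below, not from any Ruby-side documentation:

```ruby
require 'csv_utils'

# The native entry points registered in lib.rs, callable as module functions.
p CsvUtils.singleton_methods.sort # => [:dedup, :to_csv, :transform_csv]

# Positional arguments, mirroring the Rust signatures:
#   CsvUtils.dedup(previous_csv_path, new_csv_path, target_path)
#   CsvUtils.to_csv(xls_path, target_path, exclusions)
#   CsvUtils.transform_csv(csv_path, target_path, exclusions)
```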
data/ext/csv_utils/src/utils/csv.rs
ADDED
@@ -0,0 +1,113 @@
+use std::collections::HashMap;
+use std::fs::File;
+
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use csv::{StringRecord, Writer};
+use magnus::{Error, RArray, Ruby};
+
+use crate::utils::{FileExtension, magnus_err, missing_header, to_datetime_error};
+
+pub fn transform_csv(ruby: &Ruby, csv_path: String, target_path: String, exclusions: RArray) -> magnus::error::Result<()> {
+    if !csv_path.has_extension(&["csv"]) {
+        return Err(magnus::Error::new(ruby.exception_standard_error(), "csv_path must be a csv file".to_string()));
+    }
+
+    let exclusions = RArray::to_vec(exclusions)?;
+
+    let csv_file = File::open(csv_path).map_err(|e| magnus_err(ruby, e, "csv_path"))?;
+    let mut csv: csv::Reader<File> = csv::Reader::from_reader(csv_file);
+    let mut wtr = Writer::from_path(target_path).map_err(|e| magnus_err(ruby, e, "target_path"))?;
+    let headers = csv.headers().map_err(|e| magnus_err(ruby, e, "csv_path headers"))?;
+    let header_map: HashMap<String, usize> = headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect();
+    let inverse_header_map: HashMap<usize, String> = headers.iter().enumerate().map(|(i, h)| (i, h.to_string())).collect();
+
+    wtr.write_byte_record(headers.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
+
+    let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
+    let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
+    let start = header_map.get("Start").ok_or(missing_header(ruby, "Start"))?;
+    let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
+    let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
+    let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
+
+    for (ri, record) in csv.records().enumerate() {
+        let record = record.map_err(|e| magnus_err(ruby, e, "record"))?;
+
+        if skip_excluded_rows(request_id, &record, &exclusions) { continue; }
+        if has_empty_row_skip(&record) { continue; }
+        if has_empty_first_col_skip_row(&record) { continue; }
+
+        let mut date_value = Utc::now().naive_utc();
+
+        let record = record.iter().enumerate().map(|(i, c)| {
+            let c = c.trim_end();
+            if i == *date {
+                let current = string_to_datetime(c).ok_or(to_datetime_error(ruby, c, ri, "Date"))?;
+                date_value = current;
+                Ok(current.to_string())
+            } else if i == *start || i == *end || i == *actual_start || i == *actual_end {
+                if c.is_empty() { return Ok(c.to_string()); }
+                let column_name = get_column_name(&inverse_header_map, &i);
+                process_datetime(ruby, ri, date_value, c, &column_name)
+            } else {
+                Ok(c.to_string())
+            }
+        }).collect::<Result<StringRecord, magnus::Error>>()?;
+
+        let record = record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
+        wtr.write_byte_record(record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
+    }
+
+    wtr.flush().map_err(|e| magnus_err(ruby, e, "flush"))?;
+
+    Ok(())
+}
+
+fn process_datetime(ruby: &Ruby, ri: usize, date_value: NaiveDateTime, c: &str, column_name: &String) -> magnus::error::Result<String> {
+    let maybe_correct = correct_datetime(c);
+    if let Some(correct) = maybe_correct {
+        return Ok(correct.to_string());
+    }
+
+    let current_time = string_to_time(c).ok_or(to_datetime_error(ruby, c, ri, column_name))?;
+    let datetime = transform_time_to_datetime(date_value, current_time);
+    Ok(datetime.to_string())
+}
+
+fn get_column_name(inverse_header_map: &HashMap<usize, String>, i: &usize) -> String {
+    let unknown = "Unknown".to_string();
+    let column_name = inverse_header_map.get(&i).unwrap_or(&unknown);
+    column_name.to_string()
+}
+
+fn skip_excluded_rows(request_id: &usize, r: &StringRecord, exclusions: &Vec<String>) -> bool {
+    let value = r.get(*request_id).unwrap_or_default();
+    exclusions.contains(&value.to_string())
+}
+
+fn string_to_datetime(s: &str) -> Option<NaiveDateTime> {
+    let maybe_correct = correct_datetime(s);
+    if maybe_correct.is_some() { return maybe_correct; }
+
+    NaiveDate::parse_from_str(s, "%d-%b-%y").ok().map(|d| d.and_hms_opt(0, 0, 0)).flatten()
+}
+
+fn correct_datetime(s: &str) -> Option<NaiveDateTime> {
+    NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S").ok()
+}
+
+fn string_to_time(s: &str) -> Option<NaiveTime> {
+    NaiveTime::parse_from_str(s, "%H:%M").ok()
+}
+
+fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveTime) -> NaiveDateTime {
+    NaiveDateTime::new(t1.date(), t2)
+}
+
+fn has_empty_first_col_skip_row(record: &StringRecord) -> bool {
+    record[0].is_empty()
+}
+
+fn has_empty_row_skip(record: &StringRecord) -> bool {
+    record.iter().all(|r| r.is_empty())
+}
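
As a sketch of what transform_csv does end to end: it requires the Request Id, Date, Start, End, Actual Start and Actual End headers, parses the Date column ("%d-%b-%y" or an already-normalised "%Y-%m-%d %H:%M:%S"), and rewrites the "%H:%M" time columns as full datetimes on that date. A hypothetical round trip, with file names invented for illustration:

```ruby
require 'csv'
require 'csv_utils'

# Build a tiny input matching the headers csv.rs requires.
CSV.open('shifts.csv', 'w') do |csv|
  csv << ['Request Id', 'Date', 'Start', 'End', 'Actual Start', 'Actual End']
  csv << ['REQ-1', '05-Aug-24', '09:00', '17:30', '09:05', '17:25']
end

CsvUtils.transform_csv('shifts.csv', 'shifts_normalised.csv', [])
# Time columns come back as e.g. "2024-08-05 09:00:00"; the Date column is
# normalised to "2024-08-05 00:00:00".
puts File.read('shifts_normalised.csv')
```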
data/ext/csv_utils/src/utils/dedup.rs
ADDED
@@ -0,0 +1,69 @@
+use std::fs::File;
+
+use csv::{StringRecord, Writer};
+use magnus::Ruby;
+
+use crate::utils::{FileExtension, magnus_err};
+
+pub fn dedup(ruby: &Ruby, previous_csv_path: String, new_csv_path: String, target_path: String) -> magnus::error::Result<()> {
+    if !previous_csv_path.has_extension(&["csv"]) {
+        return Err(magnus::Error::new(ruby.exception_standard_error(), "previous_csv_path must be a csv file".to_string()));
+    }
+    if !new_csv_path.has_extension(&["csv"]) {
+        return Err(magnus::Error::new(ruby.exception_standard_error(), "new_csv_path must be a csv file".to_string()));
+    }
+
+    let csv1 = File::open(previous_csv_path).map_err(|e| magnus_err(ruby, e, "previous_csv_path"))?;
+    let csv2 = File::open(new_csv_path).map_err(|e| magnus_err(ruby, e, "new_csv_path"))?;
+
+    let mut previous_csv: csv::Reader<File> = csv::Reader::from_reader(csv1);
+    let mut new_csv: csv::Reader<File> = csv::Reader::from_reader(csv2);
+
+    let mut wtr = Writer::from_path(target_path).map_err(|e| magnus_err(ruby, e, "target_path"))?;
+
+    let previous_headers = previous_csv.headers().map_err(|e| magnus_err(ruby, e, "previous_csv_path headers"))?;
+    let new_headers = new_csv.headers().map_err(|e| magnus_err(ruby, e, "new_csv_path headers"))?;
+
+    if previous_headers != new_headers {
+        return Err(magnus::Error::new(ruby.exception_standard_error(), "headers of both csv files must be the same".to_string()));
+    }
+
+    wtr.write_byte_record(previous_headers.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
+
+    let mut previous_records = vec![];
+    for previous_record in previous_csv.records() {
+        let previous_record = previous_record.map_err(|e| magnus_err(ruby, e, "previous_record"))?;
+
+        if has_empty_row_skip(&previous_record) { continue; }
+        if has_empty_first_col_skip_row(&previous_record) { continue; }
+
+        let previous_record = previous_record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
+        previous_records.push(previous_record)
+    }
+
+    for new_record in new_csv.records() {
+        let new_record = new_record.map_err(|e| magnus_err(ruby, e, "new_record"))?;
+
+        if has_empty_row_skip(&new_record) { continue; }
+        if has_empty_first_col_skip_row(&new_record) { continue; }
+
+        let new_record = new_record.into_iter().map(|r| r.trim_end()).collect::<StringRecord>();
+        if !previous_records.contains(&new_record) {
+            wtr.write_byte_record(new_record.as_byte_record()).map_err(|e| magnus_err(ruby, e, "write_byte_record"))?;
+        }
+    }
+
+    wtr.flush().map_err(|e| magnus_err(ruby, e, "flush"))?;
+
+    Ok(())
+}
+
+fn has_empty_first_col_skip_row(previous_record: &StringRecord) -> bool {
+    previous_record[0].is_empty()
+}
+
+fn has_empty_row_skip(record: &StringRecord) -> bool {
+    record.iter().all(|r| r.is_empty())
+}
+
+
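
A quick sketch of the dedup semantics implemented above, with invented file names: the target file receives the shared header plus those rows of the new file that, after trailing-whitespace trimming, do not appear in the previous file.

```ruby
require 'csv'
require 'csv_utils'

CSV.open('previous.csv', 'w') { |c| c << %w[id name]; c << %w[1 alice] }
CSV.open('new.csv', 'w')      { |c| c << %w[id name]; c << %w[1 alice]; c << %w[2 bob] }

CsvUtils.dedup('previous.csv', 'new.csv', 'delta.csv')
puts File.read('delta.csv') # header row plus "2,bob"
```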
data/ext/csv_utils/src/utils/mod.rs
ADDED
@@ -0,0 +1,36 @@
+use std::error::Error;
+use std::ffi::OsStr;
+use std::path::Path;
+use magnus::Ruby;
+
+pub mod csv;
+pub mod dedup;
+pub mod xls;
+
+fn missing_header(ruby: &Ruby, header: &str) -> magnus::Error {
+    magnus::Error::new(ruby.exception_standard_error(), format!("Missing '{}' header", header))
+}
+
+fn magnus_err<E: Error>(ruby: &Ruby, e: E, msg: &str) -> magnus::Error {
+    magnus::Error::new(ruby.exception_standard_error(), format!("{}: {}", msg, e.to_string()))
+}
+
+fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus::Error {
+    magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
+}
+
+pub trait FileExtension {
+    fn has_extension<S: AsRef<str>>(&self, extensions: &[S]) -> bool;
+}
+
+impl<P: AsRef<Path>> FileExtension for P {
+    fn has_extension<S: AsRef<str>>(&self, extensions: &[S]) -> bool {
+        if let Some(ref extension) = self.as_ref().extension().and_then(OsStr::to_str) {
+            return extensions
+                .iter()
+                .any(|x| x.as_ref().eq_ignore_ascii_case(extension));
+        }
+
+        false
+    }
+}
data/ext/csv_utils/src/utils/xls.rs
ADDED
@@ -0,0 +1,114 @@
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{BufWriter, Write};
+
+use calamine::{Data, open_workbook, Range, Reader, Xls};
+use chrono::{NaiveDateTime, Utc};
+use magnus::{RArray, Ruby};
+
+use crate::utils::{FileExtension, magnus_err, missing_header, to_datetime_error};
+
+pub fn to_csv(ruby: &Ruby, xls_path: String, target_path: String, exclusions: RArray) -> magnus::error::Result<()> {
+    if !xls_path.has_extension(&["xls"]) {
+        return Err(magnus::Error::new(ruby.exception_standard_error(), "xls_path must be an xls file".to_string()));
+    }
+
+    let exclusions = RArray::to_vec(exclusions)?;
+
+    let mut workbook: Xls<_> = open_workbook(xls_path.clone()).map_err(|e| magnus_err(ruby, e, format!("could not open xls: {}", xls_path).as_str()))?;
+    let range = workbook.worksheet_range_at(0)
+        .ok_or(magnus::Error::new(ruby.exception_standard_error(), "no worksheet found in xls".to_string()))
+        .and_then(|r| r.map_err(|e| magnus_err(ruby, e, "could not read worksheet range")))?;
+
+    let headers = range.headers().ok_or(magnus::Error::new(ruby.exception_standard_error(), "no headers found in xls".to_string()))?;
+    let header_map: HashMap<String, usize> = headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect();
+    let csv_out_file = File::create(target_path.clone()).map_err(|e| magnus_err(ruby, e, format!("could not create csv file: {}", target_path).as_str()))?;
+    let mut dest = BufWriter::new(csv_out_file);
+
+    write_csv(ruby, &mut dest, &range, header_map, exclusions)
+}
+
+fn write_csv<W: Write>(ruby: &Ruby, dest: &mut W, range: &Range<Data>, header_map: HashMap<String, usize>, exclusions: Vec<String>) -> magnus::error::Result<()> {
+    let n = range.get_size().1 - 1;
+
+    let request_id = header_map.get("Request Id").ok_or(missing_header(ruby, "Request Id"))?;
+    let date = header_map.get("Date").ok_or(missing_header(ruby, "Date"))?;
+    let start = header_map.get("Start").ok_or(missing_header(ruby, "Start"))?;
+    let end = header_map.get("End").ok_or(missing_header(ruby, "End"))?;
+    let actual_start = header_map.get("Actual Start").ok_or(missing_header(ruby, "Actual Start"))?;
+    let actual_end = header_map.get("Actual End").ok_or(missing_header(ruby, "Actual End"))?;
+
+    for (ri, r) in range.rows().enumerate() {
+        let mut date_value = Utc::now().naive_utc();
+
+        if skip_excluded_rows(&request_id, r, &exclusions) { continue; }
+        if skip_empty_rows(r) { continue; }
+        if skip_rows_with_no_request_id(&request_id, r) { continue; }
+        if date_value_is_not_present(&date, r) {
+            return Err(magnus::Error::new(ruby.exception_standard_error(), format!("Date value is not present in row: {}", ri)));
+        }
+
+        for (i, c) in r.iter().enumerate() {
+            match *c {
+                Data::Empty => Ok(()),
+                Data::String(ref s) | Data::DateTimeIso(ref s) | Data::DurationIso(ref s) => {
+                    handle_commas(dest, s)
+                }
+                Data::Float(ref f) => write!(dest, "{}", f),
+                Data::DateTime(ref d) => {
+                    let mut current = d.as_datetime().ok_or(to_datetime_error(ruby, &d.to_string(), ri, "Date"))?;
+                    if i == *date {
+                        date_value = current;
+                    } else if i == *start || i == *end || i == *actual_start || i == *actual_end {
+                        current = transform_time_to_datetime(date_value, current);
+                    }
+                    write!(dest, "{}", current)
+                }
+                Data::Int(ref i) => write!(dest, "{}", i),
+                Data::Error(ref e) => write!(dest, "{:?}", e),
+                Data::Bool(ref b) => write!(dest, "{}", b),
+            }.map_err(|e| magnus_err(ruby, e, format!("error writing xls row: {}, column: {}", ri, i).as_str()))?;
+            if i != n {
+                write!(dest, ",").map_err(|e| magnus_err(ruby, e, format!("error writing csv comma for row: {}, column: {}", ri, i).as_str()))?;
+            }
+        }
+        write!(dest, "\r\n").map_err(|e| magnus_err(ruby, e, format!("error writing end of line for row: {}", ri).as_str()))?;
+    }
+    Ok(())
+}
+
+fn date_value_is_not_present(date: &usize, r: &[Data]) -> bool {
+    r[*date] == Data::Empty
+}
+
+fn skip_excluded_rows(request_id: &usize, r: &[Data], exclusions: &Vec<String>) -> bool {
+    let value = r[*request_id].to_string();
+    exclusions.contains(&value.to_string())
+}
+
+fn skip_empty_rows(r: &[Data]) -> bool {
+    r.iter().all(|c| c == &Data::Empty)
+}
+
+fn skip_rows_with_no_request_id(request_id: &usize, r: &[Data]) -> bool {
+    r[*request_id] == Data::Empty
+}
+
+fn transform_time_to_datetime(t1: NaiveDateTime, t2: NaiveDateTime) -> NaiveDateTime {
+    NaiveDateTime::new(t1.date(), t2.time())
+}
+
+fn handle_commas<W: Write>(dest: &mut W, s: &str) -> std::io::Result<()> {
+    if s.contains(",") {
+        write!(dest, "{:?}", clean_strings(s).trim_end())
+    } else {
+        write!(dest, "{}", clean_strings(s).trim_end())
+    }
+}
+
+fn clean_strings(s: &str) -> String {
+    s.replace("\n", " ")
+        .replace("\r", " ")
+        .replace("\"", "")
+        .replace("'", "")
+}
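
Tying the converter back to the Ruby side: to_csv reads the first worksheet of an .xls file, requires the same six headers used in write_csv above, drops rows whose Request Id is empty or listed in the exclusion array, and writes CRLF-terminated CSV. Usage is a single call; the paths and request ids here are placeholders:

```ruby
require 'csv_utils'

CsvUtils.to_csv('timesheets.xls', 'timesheets.csv', ['REQ-123', 'REQ-456'])
```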
data/lib/csv_utils/2.7/csv_utils.so
ADDED
Binary file
data/lib/csv_utils/3.0/csv_utils.so
ADDED
Binary file
data/lib/csv_utils/3.1/csv_utils.so
ADDED
Binary file
data/lib/csv_utils/3.2/csv_utils.so
ADDED
Binary file
data/lib/csv_utils.rb
ADDED
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+# load native extension
+begin
+  ruby_version = /(\d+\.\d+)/.match(RUBY_VERSION)
+  require_relative "csv_utils/#{ruby_version}/csv_utils"
+rescue LoadError
+  require_relative 'csv_utils/csv_utils'
+end
+
+require_relative 'csv_utils/version'
+
+module CsvUtils
+end
metadata
ADDED
@@ -0,0 +1,69 @@
+--- !ruby/object:Gem::Specification
+name: patchwork_csv_utils
+version: !ruby/object:Gem::Version
+  version: 0.1.10
+platform: aarch64-linux
+authors:
+- kingsley.hendrickse
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2024-08-09 00:00:00.000000000 Z
+dependencies: []
+description: Deduplication of CSV files and XLS to CSV conversion.
+email:
+- kingsley.hendrickse@patchwork.health
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".rspec"
+- ".rubocop.yml"
+- Cargo.lock
+- Cargo.toml
+- Gemfile
+- Gemfile.lock
+- README.md
+- Rakefile
+- ext/csv_utils/Cargo.toml
+- ext/csv_utils/extconf.rb
+- ext/csv_utils/src/lib.rs
+- ext/csv_utils/src/utils/csv.rs
+- ext/csv_utils/src/utils/dedup.rs
+- ext/csv_utils/src/utils/mod.rs
+- ext/csv_utils/src/utils/xls.rs
+- lib/csv_utils.rb
+- lib/csv_utils/2.7/csv_utils.so
+- lib/csv_utils/3.0/csv_utils.so
+- lib/csv_utils/3.1/csv_utils.so
+- lib/csv_utils/3.2/csv_utils.so
+- lib/csv_utils/version.rb
+homepage: http://github.com/patchworkhealth/csv_utils
+licenses: []
+metadata:
+  homepage_uri: http://github.com/patchworkhealth/csv_utils
+  source_code_uri: http://github.com/patchworkhealth/csv_utils
+  rubygems_mfa_required: 'false'
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '2.7'
+  - - "<"
+    - !ruby/object:Gem::Version
+      version: 3.3.dev
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.4.4
+signing_key:
+specification_version: 4
+summary: Fast CSV utils
+test_files: []