RubyGems - patchwork_csv_utils - Versions diffs - 0.1.23-x86_64-linux → 0.1.24-x86_64-linux - Mend

patchwork_csv_utils 0.1.23-x86_64-linux → 0.1.24-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +4 -4
data/Cargo.lock +238 -278
data/Gemfile +1 -1
data/Gemfile.lock +5 -3
data/ext/csv_utils/Cargo.toml +5 -4
data/ext/csv_utils/src/lib.rs +3 -3
data/ext/csv_utils/src/utils/csv.rs +160 -173
data/ext/csv_utils/src/utils/dedup.rs +102 -67
data/ext/csv_utils/src/utils/mod.rs +68 -21
data/ext/csv_utils/src/utils/shared/datetime.rs +79 -0
data/ext/csv_utils/src/utils/shared/filters.rs +130 -0
data/ext/csv_utils/src/utils/shared/mod.rs +4 -0
data/ext/csv_utils/src/utils/shared/types.rs +97 -0
data/ext/csv_utils/src/utils/shared/validation.rs +34 -0
data/ext/csv_utils/src/utils/xls.rs +272 -211
data/lib/csv_utils/2.7/csv_utils.so +0 -0
data/lib/csv_utils/3.0/csv_utils.so +0 -0
data/lib/csv_utils/3.1/csv_utils.so +0 -0
data/lib/csv_utils/3.2/csv_utils.so +0 -0
data/lib/csv_utils/version.rb +1 -1
metadata +8 -3

data/ext/csv_utils/src/utils/mod.rs CHANGED Viewed

@@ -1,21 +1,28 @@
+use ::csv::{ByteRecord, StringRecord};
+use chrono::{NaiveDate, NaiveDateTime};
+use magnus::Ruby;
 use std::collections::{HashMap, HashSet};
 use std::error::Error;
 use std::ffi::OsStr;
 use std::path::Path;
-use ::csv::{ByteRecord, StringRecord};
-use magnus::Ruby;
-use chrono::{NaiveDate, NaiveDateTime};
 pub mod csv;
 pub mod dedup;
+pub mod shared;
 pub mod xls;
 /// Builds a Ruby `StandardError` whose message is `Missing '<header>' header`.
 ///
 /// `ruby` supplies the exception class; `header` is interpolated into the message.
 fn missing_header(ruby: &Ruby, header: &str) -> magnus::Error {
-    magnus::Error::new(ruby.exception_standard_error(), format!("Missing '{}' header", header))
+    magnus::Error::new(
+        ruby.exception_standard_error(),
+        format!("Missing '{}' header", header),
+    )
 }
 /// Builds a Ruby `StandardError` whose message is `Missing value for '<header>' header`.
 ///
 /// `ruby` supplies the exception class; `header` is interpolated into the message.
 fn missing_value(ruby: &Ruby, header: &str) -> magnus::Error {
-    magnus::Error::new(ruby.exception_standard_error(), format!("Missing value for '{}' header", header))
+    magnus::Error::new(
+        ruby.exception_standard_error(),
+        format!("Missing value for '{}' header", header),
+    )
 }
 fn headers_as_byte_record(headers: Vec<String>) -> ByteRecord {
@@ -24,18 +31,32 @@ fn headers_as_byte_record(headers: Vec<String>) -> ByteRecord {
 }
 /// Wraps any `std::error::Error` in a Ruby `StandardError` with the message `<msg>: <e>`.
 ///
 /// The added line formats `e` directly instead of calling `e.to_string()` first.
 fn magnus_err<E: Error>(ruby: &Ruby, e: E, msg: &str) -> magnus::Error {
-    magnus::Error::new(ruby.exception_standard_error(), format!("{}: {}", msg, e.to_string()))
+    magnus::Error::new(ruby.exception_standard_error(), format!("{}: {}", msg, e))
 }
 /// Builds a Ruby `StandardError` reporting a datetime that could not be parsed.
 ///
 /// The message embeds the offending `value` together with the `row` and `col` passed in.
 fn to_datetime_error(ruby: &Ruby, value: &str, row: usize, col: &str) -> magnus::Error {
-    magnus::Error::new(ruby.exception_standard_error(), format!("Could not parse datetime '{}', row: {}, col: {}", value, row, col))
+    magnus::Error::new(
+        ruby.exception_standard_error(),
+        format!(
+            "Could not parse datetime '{}', row: {}, col: {}",
+            value, row, col
+        ),
+    )
 }
-fn check_mandatory_headers(ruby: &Ruby, headers: &Vec<String>, mandatory_headers: &Vec<String>, message: &str) -> Option<magnus::error::Result<()>> {
+fn check_mandatory_headers(
+    ruby: &Ruby,
+    headers: &[String],
+    mandatory_headers: &[String],
+    message: &str,
+) -> Option<magnus::error::Result<()>> {
     let csv_mandatory_headers = filter_headers(headers, mandatory_headers);
     if csv_mandatory_headers.is_empty() {
-        return Some(Err(magnus::Error::new(ruby.exception_standard_error(), format!("{} has no mandatory headers", message))));
+        return Some(Err(magnus::Error::new(
+            ruby.exception_standard_error(),
+            format!("{} has no mandatory headers", message),
+        )));
     }
     let csv_mandatory_headers = csv_mandatory_headers.to_owned().clone();
@@ -46,22 +67,45 @@ fn check_mandatory_headers(ruby: &Ruby, headers: &Vec<String>, mandatory_headers
     let difference = set2.difference(&set1).collect::<Vec<_>>();
     if !difference.is_empty() {
-        let missing_headers = difference.iter().map(|h| h.to_string()).collect::<Vec<String>>();
-        return Some(Err(magnus::Error::new(ruby.exception_standard_error(), format!("{} is missing mandatory headers: {}", message, missing_headers.join(", ")))));
+        let missing_headers = difference
+            .iter()
+            .map(|h| h.to_string())
+            .collect::<Vec<String>>();
+        return Some(Err(magnus::Error::new(
+            ruby.exception_standard_error(),
+            format!(
+                "{} is missing mandatory headers: {}",
+                message,
+                missing_headers.join(", ")
+            ),
+        )));
     }
     None
 }
 /// Returns the position of the first entry in `mandatory_headers_list` equal to
 /// `column_name`, or `None` when no entry matches.
 ///
 /// Both arguments are taken by value in the removed and the added version alike.
-fn index_of_header_in_mandatory_list(mandatory_headers_list: Vec<String>, column_name: String) -> Option<usize> {
-    mandatory_headers_list.iter().position(|h| h.to_string() == column_name)
+fn index_of_header_in_mandatory_list(
+    mandatory_headers_list: Vec<String>,
+    column_name: String,
+) -> Option<usize> {
+    mandatory_headers_list
+        .iter()
+        .position(|h| h == &column_name)
 }
 /// Returns owned copies of the entries of `csv_headers` that also occur in
 /// `expected_headers`, in the order they appear in `csv_headers`.
 ///
 /// The added version accepts slices and clones only the entries that pass the filter.
-fn filter_headers(csv_headers: &Vec<String>, expected_headers: &Vec<String>) -> Vec<String> {
-    csv_headers.iter().map(|v| v.to_string()).filter(|h| expected_headers.contains(h)).collect::<Vec<String>>()
+fn filter_headers(csv_headers: &[String], expected_headers: &[String]) -> Vec<String> {
+    csv_headers
+        .iter()
+        .filter(|h| expected_headers.contains(h))
+        .cloned()
+        .collect()
 }
 /// Builds a lookup table from header name to its zero-based position in `headers`.
 ///
 /// The added version accepts a slice instead of `&Vec<String>`.
-fn create_header_map(headers: &Vec<String>) -> HashMap<String, usize> {
-    headers.iter().enumerate().map(|(i, h)| (h.to_string(), i)).collect()
+fn create_header_map(headers: &[String]) -> HashMap<String, usize> {
+    headers
+        .iter()
+        .enumerate()
+        .map(|(i, h)| (h.to_string(), i))
+        .collect()
 }
 pub trait FileExtension {
@@ -71,7 +115,7 @@ pub trait FileExtension {
 impl<P: AsRef<Path>> FileExtension for P {
     fn has_extension<S: AsRef<str>>(&self, extensions: &[S]) -> bool {
-        if let Some(ref extension) = self.as_ref().extension().and_then(OsStr::to_str) {
+        if let Some(extension) = self.as_ref().extension().and_then(OsStr::to_str) {
             return extensions
                 .iter()
                 .any(|x| x.as_ref().eq_ignore_ascii_case(extension));
@@ -86,17 +130,20 @@ impl<P: AsRef<Path>> FileExtension for P {
 /// Parses `s` into a `NaiveDateTime`, trying three formats in order.
 ///
 /// 1. `%Y-%m-%d %H:%M:%S` (via `correct_datetime`).
 /// 2. `%Y-%m-%d`, completed with a time of 00:00:00.
 /// 3. `%d-%b-%y`, completed with a time of 00:00:00.
 ///
 /// Returns `None` when none of the formats match.
 pub fn string_to_datetime(s: &str) -> Option<NaiveDateTime> {
     let maybe_correct = correct_datetime(s);
-    if maybe_correct.is_some() { return maybe_correct; }
+    if maybe_correct.is_some() {
+        return maybe_correct;
+    }
     // Try YYYY-MM-DD format
     if let Ok(date) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
         return date.and_hms_opt(0, 0, 0);
     }
-    NaiveDate::parse_from_str(s, "%d-%b-%y").ok().map(|d| d.and_hms_opt(0, 0, 0)).flatten()
+    NaiveDate::parse_from_str(s, "%d-%b-%y")
+        .ok()
+        .and_then(|d| d.and_hms_opt(0, 0, 0))
 }
 /// Parses `s` with the exact format `%Y-%m-%d %H:%M:%S`.
 ///
 /// Returns `None` when `s` does not match that format; the parse error is discarded.
 pub fn correct_datetime(s: &str) -> Option<NaiveDateTime> {
     NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S").ok()
 }

data/ext/csv_utils/src/utils/shared/datetime.rs ADDED Viewed

@@ -0,0 +1,79 @@
+use crate::utils::{correct_datetime, to_datetime_error};
+use chrono::{NaiveDateTime, NaiveTime};
+use magnus::Ruby;
+/// Combines one fixed calendar date with time-of-day values supplied later.
+pub struct DateTimeProcessor {
+    /// Datetime whose date component is reused by every `combine_*` method.
+    date_value: NaiveDateTime,
+}
+impl DateTimeProcessor {
+    /// Creates a processor anchored on `date_value`.
+    pub fn new(date_value: NaiveDateTime) -> Self {
+        Self { date_value }
+    }
+    /// Turns the text of a time cell into a full datetime string.
+    ///
+    /// A `value` that already parses as `%Y-%m-%d %H:%M:%S` is rendered back as is.
+    /// Otherwise `value` is parsed as a time and attached to this processor's date.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error built by `to_datetime_error` (from `value`, `row_index`
+    /// and `column_name`) when neither parse succeeds.
+    pub fn process_time_column(
+        &self,
+        ruby: &Ruby,
+        value: &str,
+        row_index: usize,
+        column_name: &str,
+    ) -> magnus::error::Result<String> {
+        let resolved = match correct_datetime(value) {
+            Some(full) => full,
+            None => {
+                let time = string_to_time(value)
+                    .ok_or_else(|| to_datetime_error(ruby, value, row_index, column_name))?;
+                self.combine_date_time(time)
+            }
+        };
+        Ok(resolved.to_string())
+    }
+    /// Returns this processor's date paired with `time`.
+    pub fn combine_date_time(&self, time: NaiveTime) -> NaiveDateTime {
+        self.date_value.date().and_time(time)
+    }
+    /// Returns this processor's date paired with the time component of `time_source`;
+    /// the date component of `time_source` is ignored.
+    pub fn combine_datetime_parts(&self, time_source: NaiveDateTime) -> NaiveDateTime {
+        self.combine_date_time(time_source.time())
+    }
+}
+/// Parses `s` as a time using the `%H:%M` format.
+///
+/// Returns `None` when parsing fails; the parse error is discarded.
+fn string_to_time(s: &str) -> Option<NaiveTime> {
+    match NaiveTime::parse_from_str(s, "%H:%M") {
+        Ok(time) => Some(time),
+        Err(_) => None,
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::NaiveDate;
+    /// Builds a `NaiveDateTime` from literal parts (seconds fixed at zero);
+    /// panics if the parts do not form a valid datetime.
+    fn datetime(year: i32, month: u32, day: u32, hour: u32, minute: u32) -> NaiveDateTime {
+        NaiveDate::from_ymd_opt(year, month, day)
+            .and_then(|date| date.and_hms_opt(hour, minute, 0))
+            .expect("test fixture must be a valid datetime")
+    }
+    /// The processor's date is paired with an explicit time of day.
+    #[test]
+    fn test_combine_date_time() {
+        let processor = DateTimeProcessor::new(datetime(2024, 1, 15, 0, 0));
+        let time = NaiveTime::from_hms_opt(14, 30, 0).unwrap();
+        let combined = processor.combine_date_time(time);
+        assert_eq!(combined.to_string(), "2024-01-15 14:30:00");
+    }
+    /// Only the time component of the second datetime is used; its date is dropped.
+    #[test]
+    fn test_combine_datetime_parts() {
+        let processor = DateTimeProcessor::new(datetime(2024, 1, 15, 0, 0));
+        let time_source = datetime(2020, 1, 1, 14, 30);
+        let combined = processor.combine_datetime_parts(time_source);
+        assert_eq!(combined.to_string(), "2024-01-15 14:30:00");
+    }
+}

data/ext/csv_utils/src/utils/shared/filters.rs ADDED Viewed

@@ -0,0 +1,130 @@
+use crate::utils::string_to_datetime;
+use calamine::{Data, DataType};
+use chrono::NaiveDateTime;
+/// Row abstraction consumed by `RowFilters`, implemented in this file for
+/// `csv::StringRecord` and for `Vec<&calamine::Data>`.
+pub trait FilterableRecord {
+    /// Whether the row carries no data; both implementations in this file
+    /// return `true` when every cell is empty.
+    fn is_empty(&self) -> bool;
+    /// Whether the first cell is empty; both implementations in this file
+    /// also return `true` when the row has no first cell.
+    fn has_empty_first_column(&self) -> bool;
+    /// Text of the cell at `index`, or `None` when the row has no such cell.
+    fn get_request_id(&self, index: usize) -> Option<String>;
+    /// Text of the status cell; `None` when `index` is `None` or no value can be read.
+    fn get_status(&self, index: Option<usize>) -> Option<String>;
+    /// Datetime read from the cell at `index`, or `None` when it cannot be obtained.
+    fn get_date(&self, index: usize) -> Option<NaiveDateTime>;
+}
+/// Decides which rows are dropped before further processing.
+pub struct RowFilters {
+    /// Request ids whose rows are skipped (unless the row's status is `Recalled`).
+    exclusions: Vec<String>,
+    /// Status values whose rows are skipped.
+    status_exclusions: Vec<String>,
+    /// When set, rows dated on or before this instant are skipped.
+    earliest_start_date: Option<NaiveDateTime>,
+}
+impl RowFilters {
+    /// Bundles the three filter inputs into a `RowFilters`.
+    pub fn new(
+        exclusions: Vec<String>,
+        status_exclusions: Vec<String>,
+        earliest_start_date: Option<NaiveDateTime>,
+    ) -> Self {
+        Self {
+            exclusions,
+            status_exclusions,
+            earliest_start_date,
+        }
+    }
+    /// Returns `true` when `record` should be dropped.
+    ///
+    /// Checks run in this order and stop at the first hit: empty row, empty
+    /// first column, excluded request id, excluded status, date filter.
+    pub fn should_skip<R: FilterableRecord>(
+        &self,
+        record: &R,
+        request_id_index: usize,
+        status_index: Option<usize>,
+        date_index: usize,
+    ) -> bool {
+        if record.is_empty() || record.has_empty_first_column() {
+            return true;
+        }
+        if self.should_skip_by_exclusion(record, request_id_index, status_index) {
+            return true;
+        }
+        if self.should_skip_by_status(record, status_index) {
+            return true;
+        }
+        self.should_skip_by_date(record, date_index)
+    }
+    /// Returns `true` when the row's request id is in `exclusions`.
+    ///
+    /// Rows whose status is exactly `Recalled` are never skipped by this check.
+    fn should_skip_by_exclusion<R: FilterableRecord>(
+        &self,
+        record: &R,
+        request_id_index: usize,
+        status_index: Option<usize>,
+    ) -> bool {
+        let recalled = record.get_status(status_index).as_deref() == Some("Recalled");
+        if recalled {
+            return false;
+        }
+        match record.get_request_id(request_id_index) {
+            Some(id) => self.exclusions.contains(&id),
+            None => false,
+        }
+    }
+    /// Returns `true` when the row has a status and it is in `status_exclusions`.
+    fn should_skip_by_status<R: FilterableRecord>(
+        &self,
+        record: &R,
+        status_index: Option<usize>,
+    ) -> bool {
+        match record.get_status(status_index) {
+            Some(status) => self.status_exclusions.contains(&status),
+            None => false,
+        }
+    }
+    /// Returns `true` when a cutoff is configured, the row's date can be read,
+    /// and that date is on or before the cutoff.
+    fn should_skip_by_date<R: FilterableRecord>(&self, record: &R, date_index: usize) -> bool {
+        match self.earliest_start_date {
+            // Without a cutoff the row's date is never read.
+            None => false,
+            Some(earliest) => record
+                .get_date(date_index)
+                .map_or(false, |date| date <= earliest),
+        }
+    }
+}
+use csv::StringRecord;
+/// CSV rows: every cell is text, so emptiness means the empty string.
+impl FilterableRecord for StringRecord {
+    /// `true` when no field contains any text (also `true` for a record with no fields).
+    fn is_empty(&self) -> bool {
+        !self.iter().any(|field| !field.is_empty())
+    }
+    /// `true` when the first field is the empty string or does not exist.
+    fn has_empty_first_column(&self) -> bool {
+        self.get(0).map_or(true, str::is_empty)
+    }
+    /// Owned copy of the field at `index`, if the record has one.
+    fn get_request_id(&self, index: usize) -> Option<String> {
+        self.get(index).map(String::from)
+    }
+    /// Owned copy of the field at `index`; `None` when `index` is `None` or out of range.
+    fn get_status(&self, index: Option<usize>) -> Option<String> {
+        let idx = index?;
+        self.get(idx).map(String::from)
+    }
+    /// Field at `index` parsed by `string_to_datetime`; `None` when missing or unparseable.
+    fn get_date(&self, index: usize) -> Option<NaiveDateTime> {
+        string_to_datetime(self.get(index)?)
+    }
+}
+/// Spreadsheet rows: a cell counts as empty only when it is `Data::Empty`.
+impl FilterableRecord for Vec<&Data> {
+    /// `true` when every cell is `Data::Empty` (also `true` for a row with no cells).
+    fn is_empty(&self) -> bool {
+        self.iter().all(|cell| **cell == Data::Empty)
+    }
+    /// `true` when the first cell is `Data::Empty` or the row has no cells.
+    fn has_empty_first_column(&self) -> bool {
+        match self.first() {
+            Some(cell) => **cell == Data::Empty,
+            None => true,
+        }
+    }
+    /// `to_string()` rendering of the cell at `index`, whatever its variant.
+    fn get_request_id(&self, index: usize) -> Option<String> {
+        let cell = self.get(index)?;
+        Some(cell.to_string())
+    }
+    /// Result of `as_string()` on the cell at `index`; `None` when `index` is
+    /// `None`, out of range, or `as_string()` itself returns `None`.
+    fn get_status(&self, index: Option<usize>) -> Option<String> {
+        let cell = self.get(index?)?;
+        cell.as_string()
+    }
+    /// Datetime from a `DateTime` cell or a parseable `DateTimeIso` cell;
+    /// every other variant yields `None`.
+    fn get_date(&self, index: usize) -> Option<NaiveDateTime> {
+        match self.get(index)? {
+            Data::DateTime(d) => d.as_datetime(),
+            Data::DateTimeIso(s) => string_to_datetime(s),
+            _ => None,
+        }
+    }
+}

data/ext/csv_utils/src/utils/shared/mod.rs ADDED Viewed

@@ -0,0 +1,4 @@
+pub mod datetime;
+pub mod filters;
+pub mod types;
+pub mod validation;

data/ext/csv_utils/src/utils/shared/types.rs ADDED Viewed

@@ -0,0 +1,97 @@
+use crate::utils::{missing_header, string_to_datetime};
+use chrono::NaiveDateTime;
+use magnus::{RArray, Ruby};
+use std::collections::HashMap;
+/// A value paired with an index.
+///
+/// Field order matters: the derived ordering compares `value` first, then `index`.
+#[derive(Debug, PartialOrd, PartialEq, Eq, Ord)]
+pub struct MandatoryColumn<T> {
+    /// The stored value.
+    pub value: T,
+    /// The index stored alongside `value`.
+    pub index: usize,
+}
+impl<T> MandatoryColumn<T> {
+    /// Pairs `value` with `index`.
+    pub fn new(value: T, index: usize) -> Self {
+        Self { value, index }
+    }
+}
+/// Options passed in from Ruby, converted into native Rust values.
+pub struct ProcessingConfig {
+    /// Converted from the `exclusions` Ruby array.
+    pub exclusions: Vec<String>,
+    /// Converted from the `status_exclusions` Ruby array.
+    pub status_exclusions: Vec<String>,
+    /// Converted from the `mandatory_headers` Ruby array.
+    pub mandatory_headers: Vec<String>,
+    /// Trust name passed through unchanged.
+    pub expected_trust_name: String,
+    /// Streamed-file flag passed through unchanged.
+    pub is_streamed_file: bool,
+    /// Parsed cutoff; `None` when no string was given or it could not be parsed.
+    pub earliest_start_date: Option<NaiveDateTime>,
+}
+impl ProcessingConfig {
+    /// Converts the Ruby-side arguments into a `ProcessingConfig`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the conversion error of the first array that fails, trying
+    /// `exclusions`, then `status_exclusions`, then `mandatory_headers`. An
+    /// unparseable `earliest_start_date` is not an error; it becomes `None`.
+    pub fn from_ruby(
+        exclusions: RArray,
+        mandatory_headers: RArray,
+        status_exclusions: RArray,
+        expected_trust_name: String,
+        is_streamed_file: bool,
+        earliest_start_date: Option<String>,
+    ) -> magnus::error::Result<Self> {
+        let exclusions: Vec<String> = RArray::to_vec(exclusions)?;
+        let status_exclusions: Vec<String> = RArray::to_vec(status_exclusions)?;
+        let mandatory_headers: Vec<String> = RArray::to_vec(mandatory_headers)?;
+        let earliest_start_date = earliest_start_date
+            .as_deref()
+            .and_then(string_to_datetime);
+        Ok(Self {
+            exclusions,
+            status_exclusions,
+            mandatory_headers,
+            expected_trust_name,
+            is_streamed_file,
+            earliest_start_date,
+        })
+    }
+}
+/// Column positions of the headers the processing code looks up by name.
+pub struct HeaderConfig {
+    /// Position of the `Request Id` column.
+    pub request_id: usize,
+    /// Position of the `Date` column.
+    pub date: usize,
+    /// Position of the `Start` column.
+    pub start: usize,
+    /// Position of the `End` column.
+    pub end: usize,
+    /// Position of the `Actual Start` column.
+    pub actual_start: usize,
+    /// Position of the `Actual End` column.
+    pub actual_end: usize,
+    /// Position of the `Status` column, which is allowed to be absent.
+    pub status: Option<usize>,
+    /// Position of the `Trust` column.
+    pub trust_name: usize,
+}
+impl HeaderConfig {
+    /// Resolves every known header name against `map`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `missing_header` error for the first required header absent
+    /// from `map`, checked in field order. `Status` is optional and never errors.
+    pub fn from_header_map(
+        map: &HashMap<String, usize>,
+        ruby: &Ruby,
+    ) -> magnus::error::Result<Self> {
+        // Single lookup path for all required headers, so the header name in the
+        // error always matches the key that was searched.
+        let required = |name: &str| -> magnus::error::Result<usize> {
+            map.get(name)
+                .copied()
+                .ok_or_else(|| missing_header(ruby, name))
+        };
+        Ok(Self {
+            request_id: required("Request Id")?,
+            date: required("Date")?,
+            start: required("Start")?,
+            end: required("End")?,
+            actual_start: required("Actual Start")?,
+            actual_end: required("Actual End")?,
+            status: map.get("Status").copied(),
+            trust_name: required("Trust")?,
+        })
+    }
+    /// `true` when `index` is the `Start`, `End`, `Actual Start` or `Actual End` column.
+    pub fn is_time_column(&self, index: usize) -> bool {
+        [self.start, self.end, self.actual_start, self.actual_end].contains(&index)
+    }
+    /// `true` when `index` is the `Date` column.
+    pub fn is_date_column(&self, index: usize) -> bool {
+        self.date == index
+    }
+}

data/ext/csv_utils/src/utils/shared/validation.rs ADDED Viewed

@@ -0,0 +1,34 @@
+use magnus::{Error, Ruby};
+/// Checks that a trust name matches the expected one.
+pub struct TrustValidator {
+    /// Name the trimmed input must equal exactly.
+    expected_name: String,
+    /// When `true`, `validate` accepts any name.
+    is_streamed: bool,
+}
+impl TrustValidator {
+    /// Creates a validator for `expected_name`; `is_streamed` turns the check off.
+    pub fn new(expected_name: String, is_streamed: bool) -> Self {
+        Self {
+            expected_name,
+            is_streamed,
+        }
+    }
+    /// Compares `actual_name`, with surrounding whitespace trimmed, against the
+    /// expected name. The expected name itself is not trimmed.
+    ///
+    /// # Errors
+    ///
+    /// Returns a Ruby `StandardError` naming both values when they differ and
+    /// the validator is not in streamed mode.
+    pub fn validate(&self, ruby: &Ruby, actual_name: &str) -> magnus::error::Result<()> {
+        if self.is_streamed {
+            return Ok(());
+        }
+        let trimmed = actual_name.trim();
+        if trimmed == self.expected_name.as_str() {
+            return Ok(());
+        }
+        let message = format!(
+            "Trust actual name: '{}' is not as expected: '{}'",
+            trimmed, self.expected_name
+        );
+        Err(Error::new(ruby.exception_standard_error(), message))
+    }
+}