PyPI - pysodafair - Versions diffs - 0.1.63__tar.gz → 0.1.65__tar.gz - Mend

pysodafair 0.1.63.tar.gz → 0.1.65.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (77)

{pysodafair-0.1.63 → pysodafair-0.1.65}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pysodafair
-Version: 0.1.63
+Version: 0.1.65
 Summary: Pysoda package for Fairdataihub tools
 License: MIT
 License-File: LICENSE

{pysodafair-0.1.63 → pysodafair-0.1.65}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pysodafair"
-version = "0.1.63"
+version = "0.1.65"
 description = "Pysoda package for Fairdataihub tools"
 authors = ["Christopher Marroquin <cmarroquin@calmi2.org>"]
 license = "MIT"

{pysodafair-0.1.63 → pysodafair-0.1.65}/pysoda/core/dataset_generation/upload.py RENAMED Viewed

@@ -40,7 +40,7 @@ from os.path import (
 import pandas as pd
 import time
 from timeit import default_timer as timer
-from datetime import timedelta
+from datetime import timedelta, timezone
 import shutil
 import subprocess
 import gevent
@@ -3830,57 +3830,39 @@ def generate_manifest_file_locally(generate_purpose, soda):
 def generate_manifest_file_data(dataset_structure):
-    # Define common file extensions with special handling
     double_extensions = {
         ".ome.tiff", ".ome.tif", ".ome.tf2", ".ome.tf8", ".ome.btf", ".ome.xml",
         ".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", ".bcl.gz"
     }
-    # Helper function: Get the complete file extension
+    # Helper: Determine file extension (handles double extensions)
     def get_file_extension(filename):
         for ext in double_extensions:
             if filename.endswith(ext):
                 base_ext = os.path.splitext(os.path.splitext(filename)[0])[1]
                 return base_ext + ext
         return os.path.splitext(filename)[1]
+    # Helper: Create a manifest row for a folder
     def create_folder_entry(folder_name, path_parts):
         full_path = "/".join(path_parts + [folder_name]) + "/"
-        entry = [
-            full_path.lstrip("/"),  # Remove leading slash for consistency
-            "", # Timestamp
-            "", # Description
-            "folder", # File type
-            "",  # Entity (empty)
-            "",  # Data modality (empty)
-            "",  # Also in dataset (empty)
-            "",  # Data dictionary path (empty)
-            "",  # Entity is transitive (empty)
-            "", # Additional Metadata
+        return [
+            full_path.lstrip("/"),
+            "", "", "folder", "", "", "", "", "", ""
         ]
-        return entry
-    # Helper function: Build a single manifest entry
-    def create_file_entry(item, folder, path_parts, timestamp, filename):
-        full_path = "/".join(path_parts + [filename])
-        file_info = folder["files"][item]
+    # Helper: Create a manifest row for a file
+    def create_file_entry(file_name, file_info, path_parts, timestamp):
         entry = [
-            full_path.lstrip("/"),  # Remove leading slash for consistency
-            timestamp, # Timestamp
-            file_info["description"], # Description
-            get_file_extension(filename), # File type
-            "",  # Entity (empty)
-            "",  # Data modality (empty)
-            "",  # Also in dataset (empty)
-            "",  # Data dictionary path (empty)
-            "",  # Entity is transitive (empty)
-            file_info.get("additional-metadata", "") # Additional Metadata
+            "/".join(path_parts + [file_name]).lstrip("/"),
+            timestamp,
+            file_info["description"],
+            get_file_extension(file_name),
+            "", "", "", "", "",
+            file_info.get("additional-metadata", "")
         ]
-        # Add any extra columns dynamically
+        # Append any extra columns dynamically
         if "extra_columns" in file_info:
             for key, value in file_info["extra_columns"].items():
                 entry.append(value)
@@ -3889,48 +3871,38 @@ def generate_manifest_file_data(dataset_structure):
         return entry
-    # Recursive function: Traverse dataset and collect file data
+    # Recursive traversal of folders and files
     def traverse_folders(folder, path_parts):
-        # Add header row if processing files for the first time
         if not manifest_data:
             manifest_data.append(header_row)
-        if "files" in folder:
-            for item, file_info in folder["files"].items():
-                if "path" in file_info:
-                    file_path = file_info["path"]
-                elif "pspath" in file_info:
-                    file_path = file_info["pspath"]
-                else:
-                    continue
-                # If the file is a manifest file, skip it
-                if item in {"manifest.xlsx", "manifest.csv"}:
-                    continue
-                # Determine timestamp
-                filename = os.path.basename(file_path.replace("\\", "/"))
-                if file_info["location"] == "ps":
-                    timestamp = file_info["timestamp"]
-                else:
-                    local_path = pathlib.Path(file_info["path"])
-                    timestamp = datetime.fromtimestamp(
-                        local_path.stat().st_mtime, tz=local_timezone
-                    ).isoformat().replace(".", ",").replace("+00:00", "Z")
-                # Add file entry
-                manifest_data.append(create_file_entry(item, folder, path_parts, timestamp, filename))
-        if "folders" in folder:
-            for subfolder_name, subfolder in folder["folders"].items():
-                # Add folder entry
-                manifest_data.append(create_folder_entry(subfolder_name, path_parts))
-                traverse_folders(subfolder, path_parts + [subfolder_name])
-    # Initialize variables
-    manifest_data = []  # Collects all rows for the manifest
-    # TODO: Update to SDS 3.0
+        # Process files
+        for file_name, file_info in folder.get("files", {}).items():
+            file_path = file_info.get("path")
+            if not file_path:
+                continue
+            if file_name in {"manifest.xlsx", "manifest.csv"}:
+                continue
+            if file_info["location"] == "ps":
+                timestamp = file_info["timestamp"]
+            else:
+                local_path = pathlib.Path(file_info["path"])
+                # Create proper ISO 8601 timestamp
+                dt = datetime.fromtimestamp(local_path.stat().st_mtime, tz=timezone.utc)
+                # per the SDS spec, replace '.' with ',' in the timestamp fractional seconds section
+                timestamp = dt.isoformat().replace(".", ",").replace("+00:00", "Z")
+            manifest_data.append(create_file_entry(file_name, file_info, path_parts, timestamp))
+        # Process subfolders
+        for subfolder_name, subfolder in folder.get("folders", {}).items():
+            manifest_data.append(create_folder_entry(subfolder_name, path_parts))
+            traverse_folders(subfolder, path_parts + [subfolder_name])
+    # Initialize manifest data and header
+    manifest_data = []
     header_row = [
         "filename", "timestamp", "description", "file type", "entity",
         "data modality", "also in dataset", "data dictionary path",
@@ -3938,9 +3910,6 @@ def generate_manifest_file_data(dataset_structure):
     ]
     local_timezone = TZLOCAL()
-    # Log the dataset structure
-    # Start recursive traversal from the root
     traverse_folders(dataset_structure, [])
     return manifest_data
@@ -3948,4 +3917,3 @@ def generate_manifest_file_data(dataset_structure):

{pysodafair-0.1.63 → pysodafair-0.1.65}/pysoda/core/metadata/dataset_description.py RENAMED Viewed

@@ -42,21 +42,23 @@ def create_excel(
         .get("dataset_type", "")
     )
-    populate_standards_info(ws1, soda)
+    standards_arr_len = populate_standards_info(ws1, soda)
-    keyword_array_len = populate_basic_info(ws1, soda)
+    keyword_funding_array_len = populate_basic_info(ws1, soda)
     study_arr_len = populate_study_info(ws1, soda)
     contributor_arr_len = populate_contributor_info(ws1, soda)
     related_resource_arr_len = populate_related_resource_information(ws1, soda)
     populate_funding_info(ws1, soda)
     populate_participant_information(ws1, soda)
-    data_dictionary_information(ws1, soda)
+    dict_arr_len = data_dictionary_information(ws1, soda)
     max_len = max(
-        keyword_array_len,
+        keyword_funding_array_len,
         study_arr_len,
         contributor_arr_len,
         related_resource_arr_len,
+        standards_arr_len,
+        dict_arr_len,
     )
     # 3 is the first value column position
@@ -102,9 +104,12 @@ def populate_study_info(workbook, soda):
 def populate_standards_info(workbook, soda):
     standards_info = soda["dataset_metadata"]["dataset_description"]["standards_information"]
-    workbook["D5"] = standards_info["data_standard"]
-    workbook["D6"] = standards_info["data_standard_version"]
+    # this is an array with multiple entries
+    for col, standard in zip(excel_columns(start_index=3), standards_info):
+        workbook[col + "5"] = standard.get("data_standard", "")
+        workbook[col + "6"] = standard.get("data_standard_version", "")
+    return max(1, len(standards_info))
 def populate_basic_info(workbook, soda):
     basic_info = soda["dataset_metadata"]["dataset_description"]["basic_information"]
@@ -117,12 +122,14 @@ def populate_basic_info(workbook, soda):
     for col, keyword in zip(excel_columns(start_index=3), keywords):
         workbook[f"{col}11"] = keyword
-    workbook["D12"] = basic_info.get("funding", "")
+    funding = basic_info.get("funding", [])
+    for col, funding_source in zip(excel_columns(start_index=3), funding):
+        workbook[f"{col}12"] = funding_source
     workbook["D13"] = basic_info.get("acknowledgments", "")
     workbook["D14"] = basic_info.get("license", "")
     # Return the length of the keywords array, or 1 if empty
-    return max(1, len(keywords))
+    return max(1, len(keywords), len(funding))
 def populate_funding_info(workbook, soda):
@@ -170,11 +177,14 @@ def data_dictionary_information(workbook, soda):
     It currently does not populate any data in the workbook.
     """
     # Placeholder for future implementation
-    data_dictionary_info = soda["dataset_metadata"]["dataset_description"].get("data_dictionary_information", {})
+    data_dictionary_info = soda["dataset_metadata"]["dataset_description"].get("data_dictionary_information", [])
-    workbook["D43"] = data_dictionary_info.get("data_dictionary_path", "")
-    workbook["D44"] = data_dictionary_info.get("data_dictionary_type", "")
-    workbook["D45"] = data_dictionary_info.get("data_dictionary_description", "")
+    for column, entry in zip(excel_columns(start_index=3), data_dictionary_info):
+        workbook[column + "43"] = entry.get("data_dictionary_path", "")
+        workbook[column + "44"] = entry.get("data_dictionary_type", "")
+        workbook[column + "45"] = entry.get("data_dictionary_description", "")
+    return max(1, len(data_dictionary_info))
 def grayout_subheaders(workbook, col):
     """
@@ -228,9 +238,6 @@ def apply_dashed_border(cell, workbook):
 def extend_value_header(workbook, max_len, start_index):
     """
     The headers starting at G1 are the 'Value' headers that correspond to the maximum number of entries for either the
@@ -253,4 +260,4 @@ def extend_value_header(workbook, max_len, start_index):
         apply_calibri_bold_12(header_cell)
         set_cell_alignment(header_cell, horizontal='center', vertical='center')
         apply_dashed_border(header_cell, workbook)
-        grayout_subheaders(workbook, column_list[i - 1])
+        grayout_subheaders(workbook, column_list[i - 1])

{pysodafair-0.1.63 → pysodafair-0.1.65}/pysoda/core/metadata/manifest.py RENAMED Viewed

@@ -21,7 +21,7 @@ def create_excel(soda, upload_boolean, local_destination):
     wb = load_workbook(destination)
     ws1 = wb["Sheet1"]
     manifest = soda["dataset_metadata"]["manifest_file"]
-    # validate_schema(manifest, SCHEMA_NAME_MANIFEST)
+    validate_schema(manifest, SCHEMA_NAME_MANIFEST)
     ascii_headers = excel_columns(start_index=0)
     custom_headers_to_column = {}

{pysodafair-0.1.63 → pysodafair-0.1.65}/pysoda/schema/dataset_description.json RENAMED Viewed

@@ -8,21 +8,25 @@
     },
     "type": {
       "type": "string",
-      "description": "The type of the dataset. For example, experimental."
+      "enum": ["experimental", "computational"],
+      "description": "The type of the dataset. In short, experimental data should have data collected from subjects and/or samples."
     },
     "standards_information": {
-      "type": "object",
-      "properties": {
-        "data_standard": {
-          "type": "string",
-          "description": "The name of the standard used in the project."
-        },
-        "data_standard_version": {
-          "type": "string",
-          "description": "The version of the standard used in the project."
+      "type": "array",
+      "minItems": 1,
+      "items": {
+        "type": "object",
+        "properties": {
+          "data_standard": {
+            "type": "string",
+            "description": "The name of the standard used in the project. For example, SPARC."
+          },
+          "data_standard_version": {
+            "type": "string",
+            "description": "The version of the standard used in the project. For example, 1.0.0"
+          }
         }
-      },
-      "required": []
+      }
     },
     "basic_information": {
       "type": "object",
@@ -47,8 +51,11 @@
           "description": "A list of keywords related to the project."
         },
         "funding": {
-          "type": "string",
-          "description": "Funding awards for the project."
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Funding awards for the project. Listed as free text. E.g., OT2OD025349"
         },
         "acknowledgments": {
           "type": "string",
@@ -56,7 +63,7 @@
         },
         "license": {
           "type": "string",
-          "description": "The license under which the project is released."
+          "description": "The license under which the project is released. Use the SPDX license identifier."
         }
       },
       "required": []
@@ -128,10 +135,12 @@
         "properties": {
           "contributor_orcid_id": {
             "type": "string",
-            "description": "The ORCiD for this contributor."
+            "pattern": "^https://orcid.org/000[09]-00[01][0-9]-[0-9]{4}-[0-9]{3}([0-9]|X)$",
+            "description": "The ORCiD for this contributor. Must be a valid ORCID URL format."
           },
           "contributor_affiliation": {
             "type": "string",
+            "pattern": "^https://ror.org/0[0-9a-z]{6}[0-9]{2}$",
             "description": "The institutional affiliation for this contributor."
           },
           "contributor_name": {
@@ -142,22 +151,27 @@
           "contributor_role": {
             "type": "string",
             "enum": [
-              "PrincipalInvestigator",
-              "Creator",
-              "CoInvestigator",
               "CorrespondingAuthor",
+              "ContactPerson",
+              "Creator",
               "DataCollector",
               "DataCurator",
               "DataManager",
               "Distributor",
               "Editor",
+              "HostingInstitution",
+              "PrincipalInvestigator",
+              "CoInvestigator",
               "Producer",
               "ProjectLeader",
               "ProjectManager",
               "ProjectMember",
+              "RegistrationAgency",
+              "RegistrationAuthority",
               "RelatedPerson",
               "Researcher",
               "ResearchGroup",
+              "RightsHolder",
               "Sponsor",
               "Supervisor",
               "WorkPackageLeader",
@@ -256,40 +270,48 @@
       "type": "object",
       "properties": {
         "number_of_subjects": {
-          "type": "number",
+          "type": "integer",
+          "minimum": 0,
           "description": "The number of subjects in the study."
         },
         "number_of_samples": {
-          "type": "number",
+          "type": "integer",
+          "minimum": 0,
           "description": "The number of samples in the study."
         },
         "number_of_sites": {
-          "type": "number",
+          "type": "integer",
+          "minimum": 0,
           "description": "The number of sites in the study."
         },
         "number_of_performances": {
-          "type": "number",
+          "type": "integer",
+          "minimum": 0,
           "description": "The number of performance in the study."
         }
       },
       "required": []
     },
     "data_dictionary_information": {
-      "type": "object",
-      "properties": {
-        "data_dictionary_path": {
-          "type": "string",
-          "description": "The path to the data dictionary file."
-        },
-        "data_dictionary_type": {
-          "type": "string",
-          "description": "The type of the data dictionary. E.g., json-schema"
-        },
-        "data_dictionary_description": {
-          "type": "string",
-          "description": "A descrption of the data dictionary."
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "data_dictionary_path": {
+            "type": "string",
+            "description": "The path to the data dictionary file. Should be relative to the root of the dataset. E.g., 'code/data_dictionary/schema.json'"
+          },
+          "data_dictionary_type": {
+            "type": "string",
+            "description": "The type of the data dictionary. E.g., json-schema"
+          },
+          "data_dictionary_description": {
+            "type": "string",
+            "description": "A description of the data dictionary."
+          }
         }
       }
     }
-  }
+  },
+  "required": ["metadata_version"]
 }

{pysodafair-0.1.63 → pysodafair-0.1.65}/pysoda/schema/manifest.json RENAMED Viewed

@@ -11,7 +11,8 @@
         },
         "timestamp": {
           "type": "string",
-          "description": "Timestamp of when the data was created or last modified. This should be in ISO 8601 format."
+          "description": "Timestamp of when the data was created or last modified. This should be in ISO 8601 format. Per the SDS guidelines, commas are to be used for separating fractional seconds.",
+          "pattern": "^[0-9]{4}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-6][0-9](:[0-6][0-9](,[0-9]{1,9})?)?(Z|[+-][0-2][0-9]:[0-6][0-9])$"
         },
         "description": {
           "type": "string",
@@ -23,7 +24,8 @@
         },
         "entity": {
           "type": "string",
-          "description": "Each ID should be taken from the subject.xlsx, samples.xlsx, sites.xlsx, or performances.xlsx files."
+          "description": "Each ID should be taken from the subject.xlsx, samples.xlsx, sites.xlsx, or performances.xlsx files.",
+          "pattern": "^((pop-)?(sub|sam|site|perf)-[A-Za-z0-9]([A-Za-z0-9-]*[A-Za-z0-9])?)?$"
         },
         "data_modality": {
           "type": "string",
@@ -43,7 +45,7 @@
         },
         "entity_is_transitive": {
           "type": "string",
-          "description": "Indicates whether the entity represented in this data file is transitive. This should be either true or false."
+          "description": "Indicates whether the entity represented in this data file is transitive. This should be either true or false if provided ."
         },
         "additional_metadata": {
           "type": "string",