dcicutils 8.8.5__tar.gz → 8.8.6__tar.gz
- {dcicutils-8.8.5 → dcicutils-8.8.6}/PKG-INFO +6 -4
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ff_utils.py +4 -1
- dcicutils-8.8.6/dcicutils/file_utils.py +267 -0
- dcicutils-8.8.6/dcicutils/http_utils.py +39 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/misc_utils.py +82 -5
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/scripts/view_portal_object.py +87 -5
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/structured_data.py +35 -5
- dcicutils-8.8.6/dcicutils/tmpfile_utils.py +76 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/zip_utils.py +27 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/pyproject.toml +8 -6
- dcicutils-8.8.5/dcicutils/file_utils.py +0 -58
- dcicutils-8.8.5/dcicutils/tmpfile_utils.py +0 -36
- {dcicutils-8.8.5 → dcicutils-8.8.6}/LICENSE.txt +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/README.rst +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/__init__.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/base.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/beanstalk_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/bundle_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/captured_output.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/cloudformation_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/codebuild_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/command_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/common.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/contribution_scripts.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/contribution_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/creds_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/data_readers.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/data_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/datetime_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/deployment_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/diff_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/docker_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ecr_scripts.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ecr_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ecs_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/env_base.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/env_manager.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/env_scripts.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/env_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/env_utils_legacy.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/es_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/exceptions.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ff_mocks.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/function_cache_decorator.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/glacier_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/jh_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/kibana/dashboards.json +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/kibana/readme.md +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/lang_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_policies/c4-infrastructure.jsonc +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_policies/c4-python-infrastructure.jsonc +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_policies/park-lab-common-server.jsonc +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_policies/park-lab-common.jsonc +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_policies/park-lab-pipeline.jsonc +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/license_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/log_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/obfuscation_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/opensearch_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/portal_object_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/portal_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/progress_bar.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/project_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/qa_checkers.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/qa_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/redis_tools.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/redis_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/s3_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/schema_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/scripts/publish_to_pypi.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/scripts/run_license_checker.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/secrets_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/sheet_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/snapshot_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ssl_certificate_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/submitr/progress_constants.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/submitr/ref_lookup_strategy.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/task_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/trace_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/validation_utils.py +0 -0
- {dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/variant_utils.py +0 -0
{dcicutils-8.8.5 → dcicutils-8.8.6}/PKG-INFO
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: dcicutils
-Version: 8.8.5
+Version: 8.8.6
 Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
 Home-page: https://github.com/4dn-dcic/utils
 License: MIT
 Author: 4DN-DCIC Team
 Author-email: support@4dnucleome.org
-Requires-Python: >=3.8,<3.
+Requires-Python: >=3.8,<3.13
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -24,9 +24,10 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Database :: Database Engines/Servers
 Requires-Dist: PyJWT (>=2.6.0,<3.0.0)
 Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
+Requires-Dist: appdirs (>=1.4.4,<2.0.0)
 Requires-Dist: aws-requests-auth (>=0.4.2,<1)
-Requires-Dist: boto3 (>=1.
-Requires-Dist: botocore (>=1.
+Requires-Dist: boto3 (>=1.34.93,<2.0.0)
+Requires-Dist: botocore (>=1.34.93,<2.0.0)
 Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: docker (>=4.4.4,<5.0.0)
 Requires-Dist: elasticsearch (==7.13.4)
@@ -42,6 +43,7 @@ Requires-Dist: pytz (>=2020.4)
 Requires-Dist: redis (>=4.5.1,<5.0.0)
 Requires-Dist: requests (>=2.21.0,<3.0.0)
 Requires-Dist: rfc3986 (>=1.4.0,<2.0.0)
+Requires-Dist: shortuuid (>=1.0.13,<2.0.0)
 Requires-Dist: structlog (>=19.2.0,<20.0.0)
 Requires-Dist: toml (>=0.10.1,<1)
 Requires-Dist: tqdm (>=4.66.2,<5.0.0)
{dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/ff_utils.py
@@ -895,9 +895,12 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth):
     used to create the generator.
     Should NOT be used directly
     """
+    def get_es_host_local() -> Optional[str]:
+        return os.environ.get("ES_HOST_LOCAL", None)
     health = get_health_page(key=auth)
     if es_client is None:
-        es_url = health['elasticsearch']
+        if not (es_url := get_es_host_local()):
+            es_url = health['elasticsearch']
        es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
     namespace_star = health.get('namespace', '') + '*'
     # match all given uuids to _id fields
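The practical effect of this change: a locally running ElasticSearch can now be targeted by setting the ES_HOST_LOCAL environment variable, which _get_es_metadata prefers over the URL from the portal health page. A minimal sketch (the endpoint URL is hypothetical):

    import os

    # With this set, _get_es_metadata uses it instead of health['elasticsearch'].
    os.environ["ES_HOST_LOCAL"] = "http://localhost:9200"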
dcicutils-8.8.6/dcicutils/file_utils.py (new file)
@@ -0,0 +1,267 @@
+import glob
+import hashlib
+import io
+import os
+import pathlib
+from datetime import datetime
+import random
+import string
+from tempfile import gettempdir as get_temporary_directory
+from typing import List, Optional, Union
+from uuid import uuid4 as uuid
+
+HOME_DIRECTORY = str(pathlib.Path().home())
+
+
+def search_for_file(file: str,
+                    location: Union[str, pathlib.PosixPath, Optional[List[Union[str, pathlib.PosixPath]]]] = None,
+                    recursive: bool = False,
+                    single: bool = False,
+                    order: bool = True) -> Union[List[str], Optional[str]]:
+    """
+    Searches for the existence of the given file name, first directly in the given directory or list
+    of directories, if specified, and if not then just in the current (working) directory; if the
+    given recursive flag is True then also searches all sub-directories of these directories;
+    returns the full path name to the file if found. If the single flag is True then just the
+    first file which is found is returned (as a string), or None if none; if the single flag
+    is False, then all matched files are returned in a list, or an empty list if none.
+    """
+    def order_by_fewest_number_of_paths_and_then_alphabetically(paths: List[str]) -> List[str]:
+        def order_by(path: str):
+            return len(path.split(os.path.sep)), path
+        return sorted(paths, key=order_by)
+
+    if not (file and isinstance(file, (str, pathlib.PosixPath))):
+        return None if single is True else []
+    if os.path.isabs(file):
+        if os.path.exists(file):
+            return str(file) if single is True else [str(file)]
+        return None if single is True else []
+    files_found = []
+    if not location:
+        location = ["."]
+    elif isinstance(location, (str, pathlib.PosixPath)):
+        location = [location]
+    elif not isinstance(location, list):
+        location = []
+    location_pruned = []
+    for directory in location:
+        if not isinstance(directory, str):
+            if not isinstance(directory, pathlib.PosixPath):
+                continue
+            directory = str(directory)
+        if not (directory := directory.strip()):
+            continue
+        if os.path.isfile(directory := os.path.abspath(os.path.normpath(directory))):
+            # Actually, allow a file rather than a directory; assume its parent directory was intended.
+            if not (directory := os.path.dirname(directory)):
+                continue
+        if directory not in location_pruned:
+            location_pruned.append(directory)
+    location = location_pruned
+    for directory in location:
+        if os.path.exists(os.path.join(directory, file)):
+            file_found = os.path.abspath(os.path.normpath(os.path.join(directory, file)))
+            if single is True:
+                return file_found
+            if file_found not in files_found:
+                files_found.append(file_found)
+    if recursive is True:
+        for directory in location:
+            if not directory.endswith("/**") and not file.startswith("**/"):
+                path = f"{directory}/**/{file}"
+            else:
+                path = f"{directory}/{file}"
+            files = glob.glob(path, recursive=True if recursive is True else False)
+            if files:
+                for file_found in files:
+                    file_found = os.path.abspath(file_found)
+                    if single is True:
+                        return file_found
+                    if file_found not in files_found:
+                        files_found.append(file_found)
+    if single is True:
+        return files_found[0] if files_found else None
+    elif order is True:
+        return order_by_fewest_number_of_paths_and_then_alphabetically(files_found)
+    else:
+        return files_found
+
+
+def normalize_path(value: Union[str, pathlib.Path], absolute: bool = False, expand_home: Optional[bool] = None) -> str:
+    """
+    Normalizes the given path value and returns the result; does things like remove redundant
+    consecutive directory separators and redundant parent paths. If the given absolute argument
+    is True then converts the path to an absolute path. If the given expand_home argument is False
+    and if the path can reasonably be represented with a home directory indicator (i.e. "~"), then
+    converts it to such. If the expand_home argument is True and the path starts with the home directory
+    indicator (i.e. "~") then expands it to the actual (absolute) home path of the caller. If the
+    given path value is not actually even a string (or pathlib.Path) then returns an empty string.
+    """
+    if isinstance(value, pathlib.Path):
+        value = str(value)
+    elif not isinstance(value, str):
+        return ""
+    if not (value := value.strip()) or not (value := os.path.normpath(value)):
+        return ""
+    if expand_home is True:
+        value = os.path.expanduser(value)
+    elif (expand_home is False) and (os.name == "posix"):
+        if value.startswith(home := HOME_DIRECTORY + os.sep):
+            value = "~/" + value[len(home):]
+        elif value == HOME_DIRECTORY:
+            value = "~"
+    if absolute is True:
+        value = os.path.abspath(value)
+    return value
+
+
+def get_file_size(file: str, raise_exception: bool = True) -> Optional[int]:
+    try:
+        return os.path.getsize(file) if isinstance(file, str) else None
+    except Exception:
+        if raise_exception is True:
+            raise
+        return None
+
+
+def get_file_modified_datetime(file: str, raise_exception: bool = True) -> Optional[datetime]:
+    try:
+        return datetime.fromtimestamp(os.path.getmtime(file)) if isinstance(file, str) else None
+    except Exception:
+        if raise_exception is True:
+            raise
+        return None
+
+
+def are_files_equal(filea: str, fileb: str, raise_exception: bool = True) -> bool:
+    """
+    Returns True iff the contents of the two given files are exactly the same.
+    """
+    try:
+        with open(filea, "rb") as fa:
+            with open(fileb, "rb") as fb:
+                chunk_size = 4096
+                while True:
+                    chunka = fa.read(chunk_size)
+                    chunkb = fb.read(chunk_size)
+                    if chunka != chunkb:
+                        return False
+                    if not chunka:
+                        break
+        return True
+    except Exception:
+        if raise_exception is True:
+            raise
+        return False
+
+
+def compute_file_md5(file: str, raise_exception: bool = True) -> str:
+    """
+    Returns the md5 checksum for the given file.
+    """
+    if not isinstance(file, str):
+        return ""
+    try:
+        md5 = hashlib.md5()
+        with open(file, "rb") as file:
+            for chunk in iter(lambda: file.read(4096), b""):
+                md5.update(chunk)
+        return md5.hexdigest()
+    except Exception:
+        if raise_exception is True:
+            raise
+        return ""
+
+
+def compute_file_etag(file: str, raise_exception: bool = True) -> Optional[str]:
+    """
+    Returns the AWS S3 "etag" for the given file; this value is md5-like but
+    not the same as a normal md5. We use this to compare that a file in S3
+    appears to be exactly the same file as a local file.
+    """
+    try:
+        with io.open(file, "rb") as f:
+            return _compute_file_etag(f)
+    except Exception:
+        if raise_exception is True:
+            raise
+        return None
+
+
+def _compute_file_etag(f: io.BufferedReader) -> str:
+    # See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
+    MULTIPART_THRESHOLD = 8388608
+    MULTIPART_CHUNKSIZE = 8388608
+    # BUFFER_SIZE = 1048576
+    # Verify some assumptions are correct
+    # assert(MULTIPART_CHUNKSIZE >= MULTIPART_THRESHOLD)
+    # assert((MULTIPART_THRESHOLD % BUFFER_SIZE) == 0)
+    # assert((MULTIPART_CHUNKSIZE % BUFFER_SIZE) == 0)
+    hash = hashlib.md5()
+    read = 0
+    chunks = None
+    while True:
+        # Read some from stdin, if we're at the end, stop reading
+        bits = f.read(1048576)
+        if len(bits) == 0:
+            break
+        read += len(bits)
+        hash.update(bits)
+        if chunks is None:
+            # We're handling a multi-part upload, so switch to calculating
+            # hashes of each chunk
+            if read >= MULTIPART_THRESHOLD:
+                chunks = b''
+        if chunks is not None:
+            if (read % MULTIPART_CHUNKSIZE) == 0:
+                # Done with a chunk, add it to the list of hashes to hash later
+                chunks += hash.digest()
+                hash = hashlib.md5()
+    if chunks is None:
+        # Normal upload, just output the MD5 hash
+        etag = hash.hexdigest()
+    else:
+        # Multipart upload, need to output the hash of the hashes
+        if (read % MULTIPART_CHUNKSIZE) != 0:
+            # Add the last part if we have a partial chunk
+            chunks += hash.digest()
+        etag = hashlib.md5(chunks).hexdigest() + "-" + str(len(chunks) // 16)
+    return etag
+
+
+def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
+                       nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
+    """
+    Writes some random content to the given file (name/path). If the given file is None then writes
+    to a temporary file. In either case, returns the file written to. The number of bytes written is 1024
+    by default but can be specified with the nbytes argument; defaults to writing ASCII text but if
+    the binary argument is True then writes binary data instead; if not binary the content is in
+    lines of 80 characters each; use the line_length argument in this case to change the line length.
+    """
+    if not isinstance(nbytes, int) or nbytes < 0:
+        nbytes = 0
+    if not isinstance(file, str) or not file:
+        if not isinstance(prefix, str):
+            prefix = ""
+        if not isinstance(suffix, str):
+            suffix = ""
+        file = f"{prefix}{datetime.utcnow().strftime('%Y%m%d%H%M%S')}{str(uuid()).replace('-', '')}{suffix}"
+        file = os.path.join(get_temporary_directory(), file)
+    with open(file, "wb" if binary is True else "w") as f:
+        if binary is True:
+            f.write(os.urandom(nbytes))
+        else:
+            if (not isinstance(line_length, int)) or (line_length < 1):
+                line_length = 80
+            line_length += 1
+            nlines = nbytes // line_length
+            nremainder = nbytes % line_length
+            for n in range(nlines):
+                f.write("".join(random.choices(string.ascii_letters + string.digits, k=line_length - 1)))
+                f.write("\n")
+            if nremainder > 1:
+                f.write("".join(random.choices(string.ascii_letters + string.digits, k=nremainder - 1)))
+            if nremainder > 0:
+                f.write("\n")
+    return file
dcicutils-8.8.6/dcicutils/http_utils.py (new file)
@@ -0,0 +1,39 @@
+from contextlib import contextmanager
+import requests
+from typing import Callable, Optional
+from dcicutils.tmpfile_utils import temporary_file
+
+
+@contextmanager
+def download(url: str, suffix: Optional[str] = None, binary: bool = True,
+             progress: Optional[Callable] = None) -> Optional[str]:
+    """
+    Context manager to download the given URL into a temporary file and yield the file
+    path to it. An optional file suffix may be specified for this temporary file name.
+    Defaults to binary file mode; if not desired then pass False as the binary argument.
+    """
+    with temporary_file(suffix=suffix) as file:
+        download_to(url, file, binary=binary, progress=progress)
+        yield file
+
+
+def download_to(url: str, file: str, binary: bool = True, progress: Optional[Callable] = None) -> None:
+    """
+    Download the given URL into the given file. Defaults to binary
+    file mode; if not desired then pass False as the binary argument.
+    """
+    if not callable(progress):
+        progress = None
+    response = requests.get(url, stream=True)
+    if progress:
+        nbytes = 0
+        nbytes_total = None
+        if isinstance(content_length := response.headers.get("Content-Length"), str) and content_length.isdigit():
+            nbytes_total = int(content_length)
+    with open(file, "wb" if binary is True else "w") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+                if progress:
+                    nbytes += len(chunk)
+                    progress(nbytes, nbytes_total)
{dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/misc_utils.py
@@ -3,6 +3,7 @@ This file contains functions that might be generally useful.
 """
 
 from collections import namedtuple
+import appdirs
 import contextlib
 import datetime
 import functools
@@ -13,10 +14,12 @@ import json
 import logging
 import math
 import os
+import platform
 import pytz
 import re
 import rfc3986.validators
 import rfc3986.exceptions
+import shortuuid
 import time
 import uuid
 import warnings
@@ -1152,7 +1155,8 @@ def remove_suffix(suffix: str, text: str, required: bool = False):
 
 def remove_empty_properties(data: Optional[Union[list, dict]],
                             isempty: Optional[Callable] = None,
-                            isempty_array_element: Optional[Callable] = None) -> None:
+                            isempty_array_element: Optional[Callable] = None,
+                            raise_exception_on_nonempty_array_element_after_empty: bool = False) -> None:
     def _isempty(value: Any) -> bool:  # noqa
         return isempty(value) if callable(isempty) else value in [None, "", {}, []]
     if isinstance(data, dict):
@@ -1160,11 +1164,22 @@ def remove_empty_properties(data: Optional[Union[list, dict]],
         if _isempty(value := data[key]):
             del data[key]
         else:
-            remove_empty_properties(value, isempty=isempty, isempty_array_element=isempty_array_element)
+            remove_empty_properties(value, isempty=isempty, isempty_array_element=isempty_array_element,
+                                    raise_exception_on_nonempty_array_element_after_empty=  # noqa
+                                    raise_exception_on_nonempty_array_element_after_empty)
     elif isinstance(data, list):
         for item in data:
-            remove_empty_properties(item, isempty=isempty, isempty_array_element=isempty_array_element)
+            remove_empty_properties(item, isempty=isempty, isempty_array_element=isempty_array_element,
+                                    raise_exception_on_nonempty_array_element_after_empty=  # noqa
+                                    raise_exception_on_nonempty_array_element_after_empty)
         if callable(isempty_array_element):
+            if raise_exception_on_nonempty_array_element_after_empty is True:
+                empty_element_seen = False
+                for item in data:
+                    if not empty_element_seen and isempty_array_element(item):
+                        empty_element_seen = True
+                    elif empty_element_seen and not isempty_array_element(item):
+                        raise Exception("Non-empty element found after empty element.")
             data[:] = [item for item in data if not isempty_array_element(item)]
 
 
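The new flag is easiest to see on a small list; a sketch of the condition it guards against:

    from dcicutils.misc_utils import remove_empty_properties

    data = {"items": [{"a": 1}, {}, {"b": 2}]}
    try:
        # {} is empty and the non-empty {"b": 2} follows it, so this raises.
        remove_empty_properties(data, isempty_array_element=lambda e: e == {},
                                raise_exception_on_nonempty_array_element_after_empty=True)
    except Exception as e:
        print(e)  # -> Non-empty element found after empty element.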
@@ -1522,7 +1537,7 @@ def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]],
 def create_dict(**kwargs) -> dict:
     result = {}
     for name in kwargs:
-        if kwargs[name]:
+        if not (kwargs[name] is None):
             result[name] = kwargs[name]
     return result
 
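This create_dict change is behavioral: previously any falsey value (0, False, "") was dropped; now only None is. For example:

    from dcicutils.misc_utils import create_dict

    print(create_dict(count=0, flag=False, name=None))  # -> {'count': 0, 'flag': False}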
@@ -2548,6 +2563,19 @@ def normalize_spaces(value: str) -> str:
     return re.sub(r"\s+", " ", value).strip()
 
 
+def normalize_string(value: Optional[str]) -> Optional[str]:
+    """
+    Strips leading/trailing spaces, and converts multiple consecutive spaces to a single space
+    in the given string value and returns the result. If the given value is None returns an
+    empty string. If the given value is not actually even a string then returns None.
+    """
+    if value is None:
+        return ""
+    elif isinstance(value, str):
+        return re.sub(r"\s+", " ", value).strip()
+    return None
+
+
 def find_nth_from_end(string: str, substring: str, nth: int) -> int:
     """
     Returns the index of the nth occurrence of the given substring within
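A few illustrative calls for normalize_string:

    from dcicutils.misc_utils import normalize_string

    assert normalize_string("  hello   world  ") == "hello world"
    assert normalize_string(None) == ""   # None maps to an empty string ...
    assert normalize_string(123) is None  # ... but any other non-string maps to None.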
@@ -2590,7 +2618,11 @@ def format_size(nbytes: Union[int, float], precision: int = 2, nospace: bool = F
         nbytes = int(nbytes)
         return f"{nbytes} byte{'s' if nbytes != 1 else ''}"
     unit = (UNITS_TERSE if terse else UNITS)[index]
-    return f"{nbytes:.{precision}f}{'' if nospace else ' '}{unit}"
+    size = f"{nbytes:.{precision}f}"
+    if size.endswith(f".{'0' * precision}"):
+        # Tidy up extraneous zeros.
+        size = size[:-(precision - 1)]
+    return f"{size}{'' if nospace else ' '}{unit}"
 
 
 def format_duration(seconds: Union[int, float]) -> str:
@@ -2670,3 +2702,48 @@ class JsonLinesReader:
                 yield line
             else:
                 raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}")
+
+
+def get_app_specific_directory() -> str:
+    """
+    Returns the standard system application specific directory:
+    - On MacOS this directory is: ~/Library/Application Support
+    - On Linux this directory is: ~/.local/share
+    - On Windows this directory is: %USERPROFILE%\\AppData\\Local  # noqa
+    N.B. This has been tested on MacOS and Linux but not on Windows.
+    """
+    return appdirs.user_data_dir()
+
+
+def get_os_name() -> str:
+    if os_name := platform.system():
+        if os_name == "Darwin": return "osx"  # noqa
+        elif os_name == "Linux": return "linux"  # noqa
+        elif os_name == "Windows": return "windows"  # noqa
+    return ""
+
+
+def get_cpu_architecture_name() -> str:
+    if os_architecture_name := platform.machine():
+        if os_architecture_name == "x86_64": return "amd64"  # noqa
+        return os_architecture_name
+    return ""
+
+
+def create_uuid(nodash: bool = False, upper: bool = False) -> str:
+    value = str(uuid.uuid4())
+    if nodash is True:
+        value = value.replace("-", "")
+    if upper is True:
+        value = value.upper()
+    return value
+
+
+def create_short_uuid(length: Optional[int] = None, upper: bool = False):
+    # Not really technically a uuid of course.
+    if (length is None) or (not isinstance(length, int)) or (length < 1):
+        length = 16
+    value = shortuuid.ShortUUID().random(length=length)
+    if upper is True:
+        value = value.upper()
+    return value
{dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/scripts/view_portal_object.py
@@ -57,6 +57,7 @@
 
 import argparse
 from functools import lru_cache
+import io
 import json
 import pyperclip
 import os
@@ -97,11 +98,18 @@ def main():
                         help="Include all properties for schema usage.")
     parser.add_argument("--raw", action="store_true", required=False, default=False, help="Raw output.")
     parser.add_argument("--tree", action="store_true", required=False, default=False, help="Tree output for schemas.")
+    parser.add_argument("--post", type=str, required=False, default=None,
+                        help="POST data of the main arg type with data from file specified with this option.")
+    parser.add_argument("--patch", type=str, required=False, default=None,
+                        help="PATCH data of the main arg type with data from file specified with this option.")
     parser.add_argument("--database", action="store_true", required=False, default=False,
                         help="Read from database output.")
+    parser.add_argument("--bool", action="store_true", required=False,
+                        default=False, help="Only return whether found or not.")
     parser.add_argument("--yaml", action="store_true", required=False, default=False, help="YAML output.")
     parser.add_argument("--copy", "-c", action="store_true", required=False, default=False,
                         help="Copy object data to clipboard.")
+    parser.add_argument("--indent", required=False, default=False, help="Indent output.", type=int)
     parser.add_argument("--details", action="store_true", required=False, default=False, help="Detailed output.")
     parser.add_argument("--more-details", action="store_true", required=False, default=False,
                         help="More detailed output.")
@@ -151,6 +159,18 @@ def main():
         args.schema = True
 
     if args.schema:
+        if args.post:
+            if post_data := _read_json_from_file(args.post):
+                if args.verbose:
+                    _print(f"POSTing data from file ({args.post}) as type: {args.uuid}")
+                if isinstance(post_data, dict):
+                    post_data = [post_data]
+                elif not isinstance(post_data, list):
+                    _print(f"POST data neither list nor dictionary: {args.post}")
+                for item in post_data:
+                    portal.post_metadata(args.uuid, item)
+                if args.verbose:
+                    _print(f"Done POSTing data from file ({args.post}) as type: {args.uuid}")
         schema, schema_name = _get_schema(portal, args.uuid)
         if schema:
             if args.copy:
@@ -166,14 +186,50 @@ def main():
             _print_schema(schema, details=args.details, more_details=args.details,
                           all=args.all, raw=args.raw, raw_yaml=args.yaml)
             return
-    data = _get_portal_object(portal=portal, uuid=args.uuid, raw=args.raw,
-                              database=args.database, verbose=args.verbose)
+    elif args.patch:
+        if patch_data := _read_json_from_file(args.patch):
+            if args.verbose:
+                _print(f"PATCHing data from file ({args.patch}) for object: {args.uuid}")
+            if isinstance(patch_data, dict):
+                patch_data = [patch_data]
+            elif not isinstance(patch_data, list):
+                _print(f"PATCH data neither list nor dictionary: {args.patch}")
+            for item in patch_data:
+                portal.patch_metadata(args.uuid, item)
+            if args.verbose:
+                _print(f"Done PATCHing data from file ({args.patch}) as type: {args.uuid}")
+            return
+        else:
+            _print(f"No PATCH data found in file: {args.patch}")
+            exit(1)
+
+    data = _get_portal_object(portal=portal, uuid=args.uuid, raw=args.raw,
+                              database=args.database, check=args.bool, verbose=args.verbose)
+    if args.bool:
+        if data:
+            _print(f"{args.uuid}: found")
+            exit(0)
+        else:
+            _print(f"{args.uuid}: not found")
+            exit(1)
     if args.copy:
         pyperclip.copy(json.dumps(data, indent=4))
     if args.yaml:
         _print(yaml.dump(data))
     else:
-        _print(json.dumps(data, default=str, indent=4))
+        if args.indent > 0:
+            _print(_format_json_with_indent(data, indent=args.indent))
+        else:
+            _print(json.dumps(data, default=str, indent=4))
+
+
+def _format_json_with_indent(value: dict, indent: int = 0) -> Optional[str]:
+    if isinstance(value, dict):
+        result = json.dumps(value, indent=4)
+        if indent > 0:
+            result = f"{indent * ' '}{result}"
+            result = result.replace("\n", f"\n{indent * ' '}")
+        return result
 
 
 def _create_portal(ini: str, env: Optional[str] = None,
@@ -198,7 +254,8 @@ def _create_portal(ini: str, env: Optional[str] = None,
 
 
 def _get_portal_object(portal: Portal, uuid: str,
-                       raw: bool = False, database: bool = False, verbose: bool = False) -> dict:
+                       raw: bool = False, database: bool = False,
+                       check: bool = False, verbose: bool = False) -> dict:
     response = None
     try:
         if not uuid.startswith("/"):
@@ -212,13 +269,18 @@ def _get_portal_object(portal: Portal, uuid: str,
             _exit()
         _exit(f"Exception getting Portal object from {portal.server}: {uuid}\n{get_error_message(e)}")
     if not response:
+        if check:
+            return None
         _exit(f"Null response getting Portal object from {portal.server}: {uuid}")
     if response.status_code not in [200, 307]:
         # TODO: Understand why the /me endpoint returns HTTP status code 307, which is only why we mention it above.
         _exit(f"Invalid status code ({response.status_code}) getting Portal object from {portal.server}: {uuid}")
     if not response.json:
         _exit(f"Invalid JSON getting Portal object: {uuid}")
-    return response.json()
+    response = response.json()
+    if raw:
+        response.pop("schema_version", None)
+    return response
 
 
 @lru_cache(maxsize=1)
@@ -257,6 +319,7 @@ def _print_schema_info(schema: dict, level: int = 0,
                        required: Optional[List[str]] = None) -> None:
     if not schema or not isinstance(schema, dict):
         return
+    identifying_properties = schema.get("identifyingProperties")
     if level == 0:
         if required_properties := schema.get("required"):
             _print("- required properties:")
@@ -383,6 +446,8 @@ def _print_schema_info(schema: dict, level: int = 0,
             suffix += f" | enum"
         if property_required:
             suffix += f" | required"
+        if property_name in (identifying_properties or []):
+            suffix += f" | identifying"
         if property.get("uniqueKey"):
             suffix += f" | unique"
         if pattern := property.get("pattern"):
@@ -529,6 +594,23 @@ def _print_tree(root_name: Optional[str],
         print(line)
 
 
+def _read_json_from_file(file: str) -> Optional[dict]:
+    if not os.path.exists(file):
+        _print(f"Cannot find file: {file}")
+        exit(1)
+    try:
+        with io.open(file, "r") as f:
+            try:
+                return json.load(f)
+            except Exception:
+                _print(f"Cannot parse JSON in file: {file}")
+                exit(1)
+    except Exception as e:
+        print(e)
+        _print(f"Cannot open file: {file}")
+        exit(1)
+
+
 def _print(*args, **kwargs):
     with uncaptured_output():
         PRINT(*args, **kwargs)
{dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/structured_data.py
@@ -53,6 +53,7 @@ class StructuredDataSet:
     def __init__(self, file: Optional[str] = None, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
                  schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
                  order: Optional[List[str]] = None, prune: bool = True,
+                 remove_empty_objects_from_lists: bool = True,
                  ref_lookup_strategy: Optional[Callable] = None,
                  ref_lookup_nocache: bool = False,
                  norefs: bool = False,
@@ -65,7 +66,8 @@ class StructuredDataSet:
                                          ref_lookup_nocache=ref_lookup_nocache) if portal else None
         self._ref_lookup_strategy = ref_lookup_strategy
         self._order = order
-        self._prune = prune
+        self._prune = prune is True
+        self._remove_empty_objects_from_lists = remove_empty_objects_from_lists is True
         self._warnings = {}
         self._errors = {}
         self._resolved_refs = set()
@@ -93,12 +95,14 @@ class StructuredDataSet:
     def load(file: str, portal: Optional[Union[VirtualApp, TestApp, Portal]] = None,
              schemas: Optional[List[dict]] = None, autoadd: Optional[dict] = None,
              order: Optional[List[str]] = None, prune: bool = True,
+             remove_empty_objects_from_lists: bool = True,
              ref_lookup_strategy: Optional[Callable] = None,
              ref_lookup_nocache: bool = False,
              norefs: bool = False,
             progress: Optional[Callable] = None,
             debug_sleep: Optional[str] = None) -> StructuredDataSet:
         return StructuredDataSet(file=file, portal=portal, schemas=schemas, autoadd=autoadd, order=order, prune=prune,
+                                 remove_empty_objects_from_lists=remove_empty_objects_from_lists,
                                  ref_lookup_strategy=ref_lookup_strategy, ref_lookup_nocache=ref_lookup_nocache,
                                  norefs=norefs, progress=progress, debug_sleep=debug_sleep)
 
@@ -346,7 +350,18 @@ class StructuredDataSet:
 
     def _load_json_file(self, file: str) -> None:
         with open(file) as f:
-            self._add(Schema.type_name(file), json.load(f))
+            file_json = json.load(f)
+            schema_inferred_from_file_name = Schema.type_name(file)
+            if self._portal.get_schema(schema_inferred_from_file_name) is not None:
+                # If the JSON file name looks like a schema name then assume it
+                # contains an object or an array of objects of that schema type.
+                self._add(Schema.type_name(file), file_json)
+            elif isinstance(file_json, dict):
+                # Otherwise if the JSON file name does not look like a schema name then
+                # assume it is a dictionary where each property is the name of a schema,
+                # and which (each property) contains a list of objects of that schema type.
+                for schema_name in file_json:
+                    self._add(schema_name, file_json[schema_name])
 
     def _load_reader(self, reader: RowReader, type_name: str) -> None:
         schema = None
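The two JSON file shapes _load_json_file now accepts, sketched with hypothetical type and file names:

    # Shape 1: a file named like a schema (e.g. file_format.json) holds an
    # object, or an array of objects, of that type:
    file_format_json = [{"identifier": "bam"}, {"identifier": "fastq"}]

    # Shape 2: any other file name; top-level properties name schema types,
    # each holding a list of objects of that type:
    data_json = {
        "FileFormat": [{"identifier": "bam"}],
        "Software": [{"name": "samtools"}],
    }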
@@ -368,7 +383,11 @@ class StructuredDataSet:
                 structured_row_template.set_value(structured_row, column_name, value, reader.file, reader.row_number)
             if self._autoadd_properties:
                 self._add_properties(structured_row, self._autoadd_properties, schema)
-            self._add(type_name, structured_row)
+            if (prune_error := self._prune_structured_row(structured_row)) is not None:
+                self._note_error({"src": create_dict(type=schema_name, row=reader.row_number),
+                                  "error": prune_error}, "validation")
+            else:
+                self._add(type_name, structured_row)
             if self._progress:
                 self._progress({
                     PROGRESS.LOAD_ITEM: self._nrows,
@@ -385,9 +404,20 @@ class StructuredDataSet:
             self._note_error(schema._unresolved_refs, "ref")
         self._resolved_refs.update(schema._resolved_refs)
 
-    def _add(self, type_name: str, data: Union[dict, List[dict]]) -> None:
-        if self._prune:
+    def _prune_structured_row(self, data: dict) -> Optional[str]:
+        if not self._prune:
+            return None
+        if not self._remove_empty_objects_from_lists:
             remove_empty_properties(data)
+            return None
+        try:
+            remove_empty_properties(data, isempty_array_element=lambda element: element == {},
+                                    raise_exception_on_nonempty_array_element_after_empty=True)
+        except Exception as e:
+            return str(e)
+        return None
+
+    def _add(self, type_name: str, data: Union[dict, List[dict]]) -> None:
         if type_name in self._data:
             self._data[type_name].extend([data] if isinstance(data, dict) else data)
         else:
dcicutils-8.8.6/dcicutils/tmpfile_utils.py (new file)
@@ -0,0 +1,76 @@
+from contextlib import contextmanager
+from datetime import datetime
+import os
+import shutil
+import tempfile
+from uuid import uuid4 as uuid
+from typing import List, Optional, Union
+from dcicutils.file_utils import create_random_file
+
+
+@contextmanager
+def temporary_directory() -> str:
+    try:
+        with tempfile.TemporaryDirectory() as tmp_directory_name:
+            yield tmp_directory_name
+    finally:
+        remove_temporary_directory(tmp_directory_name)
+
+
+@contextmanager
+def temporary_file(name: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
+                   content: Optional[Union[str, bytes, List[str]]] = None) -> str:
+    with temporary_directory() as tmp_directory_name:
+        tmp_file_name = f"{prefix or ''}{name or tempfile.mktemp(dir='')}{suffix or ''}"
+        tmp_file_path = os.path.join(tmp_directory_name, tmp_file_name)
+        with open(tmp_file_path, "wb" if isinstance(content, bytes) else "w") as tmp_file:
+            if content is not None:
+                tmp_file.write("\n".join(content) if isinstance(content, list) else content)
+        yield tmp_file_path
+
+
+def create_temporary_file_name(prefix: Optional[str] = None, suffix: Optional[str] = None) -> str:
+    """
+    Generates and returns the full path to a file within the system temporary directory.
+    """
+    random_string = f"{datetime.utcnow().strftime('%Y%m%d%H%M%S')}{str(uuid()).replace('-', '')}"
+    tmp_file_name = f"{prefix or ''}{random_string}{suffix or ''}"
+    return os.path.join(tempfile.gettempdir(), tmp_file_name)
+
+
+@contextmanager
+def temporary_random_file(prefix: Optional[str] = None, suffix: Optional[str] = None,
+                          nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
+    with temporary_file(prefix=prefix, suffix=suffix) as tmp_file_path:
+        create_random_file(tmp_file_path, nbytes=nbytes, binary=binary, line_length=line_length)
+        yield tmp_file_path
+
+
+def remove_temporary_directory(tmp_directory_name: str) -> None:
+    """
+    Removes the given directory, recursively; but ONLY if it is (somewhere) within the system temporary directory.
+    """
+    if is_temporary_directory(tmp_directory_name):  # Guard against errant deletion.
+        shutil.rmtree(tmp_directory_name)
+
+
+def remove_temporary_file(tmp_file_name: str) -> bool:
+    """
+    Removes the given file; but ONLY if it is (somewhere) within the system temporary directory.
+    """
+    try:
+        tmpdir = tempfile.gettempdir()
+        if (os.path.commonpath([tmpdir, tmp_file_name]) == tmpdir) and os.path.isfile(tmp_file_name):
+            os.remove(tmp_file_name)
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def is_temporary_directory(path: str) -> bool:
+    try:
+        tmpdir = tempfile.gettempdir()
+        return os.path.commonpath([path, tmpdir]) == tmpdir and os.path.exists(path) and os.path.isdir(path)
+    except Exception:
+        return False
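A usage sketch of the temporary-file helpers:

    from dcicutils.tmpfile_utils import temporary_file, temporary_random_file

    with temporary_file(suffix=".json", content='{"a": 1}') as path:
        print(path)  # lives under the system temporary directory
    # The file and its enclosing temporary directory are gone here.

    with temporary_random_file(suffix=".txt", nbytes=2048) as path:
        print(path)  # 2048 bytes of random ASCII text lines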
{dcicutils-8.8.5 → dcicutils-8.8.6}/dcicutils/zip_utils.py
@@ -2,7 +2,9 @@ from contextlib import contextmanager
 from dcicutils.tmpfile_utils import temporary_directory, temporary_file
 import gzip
 import os
+import shutil
 import tarfile
+import tempfile
 from typing import List, Optional
 import zipfile
 
@@ -45,3 +47,28 @@ def unpack_gz_file_to_temporary_file(file: str, suffix: Optional[str] = None) ->
             outputf.write(inputf.read())
             outputf.close()
             yield tmp_file_name
+
+
+def extract_file_from_zip(zip_file: str, file_to_extract: str,
+                          destination_file: str, raise_exception: bool = True) -> bool:
+    """
+    Extracts from the given zip file the given file to extract, writing it to the
+    given destination file. Returns True if all is well, otherwise False; or, if the
+    raise_exception argument is True (the default), raises an exception on error.
+    """
+    try:
+        if not (destination_directory := os.path.dirname(destination_file)):
+            destination_directory = os.getcwd()
+            destination_file = os.path.join(destination_directory, destination_file)
+        with tempfile.TemporaryDirectory() as tmp_directory_name:
+            with zipfile.ZipFile(zip_file, "r") as zipf:
+                if file_to_extract not in zipf.namelist():
+                    return False
+                zipf.extract(file_to_extract, path=tmp_directory_name)
+                os.makedirs(destination_directory, exist_ok=True)
+                shutil.move(os.path.join(tmp_directory_name, file_to_extract), destination_file)
+                return True
+    except Exception as e:
+        if raise_exception:
+            raise e
+        return False
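A sketch of extract_file_from_zip (archive and member names are hypothetical):

    from dcicutils.zip_utils import extract_file_from_zip

    # True on success; False if the member is absent (raise_exception=False
    # also suppresses exceptions in favor of a False return).
    ok = extract_file_from_zip("bundle.zip", "docs/readme.txt",
                               "extracted/readme.txt", raise_exception=False)
    print("extracted" if ok else "not extracted")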
{dcicutils-8.8.5 → dcicutils-8.8.6}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.8.5"
+version = "8.8.6"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"
@@ -37,12 +37,13 @@ classifiers = [
 
 
 [tool.poetry.dependencies]
-python = ">=3.8,<3.
-boto3 = "^1.
-botocore = "^1.
+python = ">=3.8,<3.13"
+boto3 = "^1.34.93"
+botocore = "^1.34.93"
 # The DCIC portals (cgap-portal and fourfront) are very particular about which ElasticSearch version.
 # This value is intentionally pinned and must not be changed casually.
 elasticsearch = "7.13.4"
+appdirs = "^1.4.4"
 aws-requests-auth = ">=0.4.2,<1"
 chardet = "^5.2.0"
 docker = "^4.4.4"
@@ -60,6 +61,7 @@ pyperclip = "^1.8.2"
 PyYAML = "^6.0.1"
 requests = "^2.21.0"
 rfc3986 = "^1.4.0"
+shortuuid = "^1.0.13"
 structlog = "^19.2.0"
 toml = ">=0.10.1,<1"
 tqdm = "^4.66.2"
@@ -69,8 +71,8 @@ webtest = "^2.0.34"
 
 
 [tool.poetry.dev-dependencies]
-boto3-stubs = "^1.
-botocore-stubs = "^1.
+boto3-stubs = "^1.34.93"
+botocore-stubs = "^1.34.93"
 coverage = ">=7.2.3"
 # Loaded manually in GA workflow for coverage because a dependency on 2to3
 # in its docopts dependency makes a problem for loading it here in poetry. -kmp 7-Apr-2023
dcicutils-8.8.5/dcicutils/file_utils.py (deleted)
@@ -1,58 +0,0 @@
-import glob
-import os
-import pathlib
-from typing import List, Optional, Union
-
-
-def search_for_file(file: str,
-                    location: Union[str, Optional[List[str]]] = None,
-                    recursive: bool = False,
-                    single: bool = False) -> Union[List[str], Optional[str]]:
-    """
-    Searches for the existence of the given file name, first directly in the given directory or list
-    of directories, if specified, and if not then just in the current (working) directory; if the
-    given recursive flag is True then also searches all sub-directories of these directories;
-    returns the full path name to the file if found. If the single flag is True then just the
-    first file which is found is returned (as a string), or None if none; if the single flag
-    is False, then all matched files are returned in a list, or an empty list if none.
-    """
-    if file and isinstance(file, (str, pathlib.PosixPath)):
-        if os.path.isabs(file):
-            if os.path.exists(file):
-                return file if single else [file]
-            return None if single else []
-        files_found = []
-        if not location:
-            location = ["."]
-        elif isinstance(location, (str, pathlib.PosixPath)):
-            location = [location]
-        elif not isinstance(location, list):
-            location = []
-        for directory in location:
-            if not directory:
-                continue
-            if isinstance(directory, (str, pathlib.PosixPath)) and os.path.exists(os.path.join(directory, file)):
-                file_found = os.path.abspath(os.path.normpath(os.path.join(directory, file)))
-                if single:
-                    return file_found
-                if file_found not in files_found:
-                    files_found.append(file_found)
-        if recursive:
-            for directory in location:
-                if not directory:
-                    continue
-                if not directory.endswith("/**") and not file.startswith("**/"):
-                    path = f"{directory}/**/{file}"
-                else:
-                    path = f"{directory}/{file}"
-                files = glob.glob(path, recursive=recursive)
-                if files:
-                    for file_found in files:
-                        file_found = os.path.abspath(file_found)
-                        if single:
-                            return file_found
-                        if file_found not in files_found:
-                            files_found.append(file_found)
-        if files_found:
-            return files_found[0] if single else files_found
-        return None if single else []
dcicutils-8.8.5/dcicutils/tmpfile_utils.py (deleted)
@@ -1,36 +0,0 @@
-from contextlib import contextmanager
-import os
-import shutil
-import tempfile
-from typing import List, Optional, Union
-
-
-@contextmanager
-def temporary_directory() -> str:
-    try:
-        with tempfile.TemporaryDirectory() as tmp_directory_name:
-            yield tmp_directory_name
-    finally:
-        remove_temporary_directory(tmp_directory_name)
-
-
-@contextmanager
-def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None,
-                   content: Optional[Union[str, bytes, List[str]]] = None) -> str:
-    with temporary_directory() as tmp_directory_name:
-        tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "")
-        with open(tmp_file_name, "wb" if isinstance(content, bytes) else "w") as tmp_file:
-            if content is not None:
-                tmp_file.write("\n".join(content) if isinstance(content, list) else content)
-        yield tmp_file_name
-
-
-def remove_temporary_directory(tmp_directory_name: str) -> None:
-    def is_temporary_directory(path: str) -> bool:
-        try:
-            tmpdir = tempfile.gettempdir()
-            return os.path.commonpath([path, tmpdir]) == tmpdir and os.path.exists(path) and os.path.isdir(path)
-        except Exception:
-            return False
-    if is_temporary_directory(tmp_directory_name):  # Guard against errant deletion.
-        shutil.rmtree(tmp_directory_name)