fmtr.tools 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. fmtr.tools-0.1/PKG-INFO +20 -0
  2. fmtr.tools-0.1/fmtr/tools/augmentation_tools.py +10 -0
  3. fmtr.tools-0.1/fmtr/tools/config.py +13 -0
  4. fmtr.tools-0.1/fmtr/tools/config_tools.py +54 -0
  5. fmtr.tools-0.1/fmtr/tools/dataclass_tools.py +53 -0
  6. fmtr.tools-0.1/fmtr/tools/datatype_tools.py +46 -0
  7. fmtr.tools-0.1/fmtr/tools/docker_tools.py +36 -0
  8. fmtr.tools-0.1/fmtr/tools/environment_tools.py +76 -0
  9. fmtr.tools-0.1/fmtr/tools/function_tools.py +30 -0
  10. fmtr.tools-0.1/fmtr/tools/hash_tools.py +13 -0
  11. fmtr.tools-0.1/fmtr/tools/iterator_tools.py +45 -0
  12. fmtr.tools-0.1/fmtr/tools/json_tools.py +23 -0
  13. fmtr.tools-0.1/fmtr/tools/logging_tools.py +64 -0
  14. fmtr.tools-0.1/fmtr/tools/parallel_tools.py +103 -0
  15. fmtr.tools-0.1/fmtr/tools/path_tools.py +147 -0
  16. fmtr.tools-0.1/fmtr/tools/platform_tools.py +14 -0
  17. fmtr.tools-0.1/fmtr/tools/process_tools.py +77 -0
  18. fmtr.tools-0.1/fmtr/tools/profiling_tools.py +8 -0
  19. fmtr.tools-0.1/fmtr/tools/random_tools.py +107 -0
  20. fmtr.tools-0.1/fmtr/tools/string_tools.py +34 -0
  21. fmtr.tools-0.1/fmtr/tools/tests/__init__.py +0 -0
  22. fmtr.tools-0.1/fmtr/tools/tests/conftest.py +0 -0
  23. fmtr.tools-0.1/fmtr/tools/tests/helpers.py +39 -0
  24. fmtr.tools-0.1/fmtr/tools/tests/test_datatype.py +33 -0
  25. fmtr.tools-0.1/fmtr/tools/tests/test_environment.py +76 -0
  26. fmtr.tools-0.1/fmtr/tools/tests/test_json.py +13 -0
  27. fmtr.tools-0.1/fmtr/tools/tests/test_path.py +95 -0
  28. fmtr.tools-0.1/fmtr/tools/tests/test_yaml.py +13 -0
  29. fmtr.tools-0.1/fmtr/tools/tokenization_tools.py +159 -0
  30. fmtr.tools-0.1/fmtr/tools/tools.py +49 -0
  31. fmtr.tools-0.1/fmtr/tools/version +1 -0
  32. fmtr.tools-0.1/fmtr/tools/yaml_tools.py +29 -0
  33. fmtr.tools-0.1/fmtr.tools.egg-info/PKG-INFO +20 -0
  34. fmtr.tools-0.1/fmtr.tools.egg-info/SOURCES.txt +37 -0
  35. fmtr.tools-0.1/fmtr.tools.egg-info/dependency_links.txt +1 -0
  36. fmtr.tools-0.1/fmtr.tools.egg-info/requires.txt +37 -0
  37. fmtr.tools-0.1/fmtr.tools.egg-info/top_level.txt +1 -0
  38. fmtr.tools-0.1/setup.cfg +4 -0
  39. fmtr.tools-0.1/setup.py +32 -0
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.1
2
+ Name: fmtr.tools
3
+ Version: 0.1
4
+ Summary: Frontmatter tools for AI projects
5
+ Home-page: https://github.com/fmtr/fmtr.tools
6
+ Author: Frontmatter
7
+ Author-email: innovative.fowler@mask.pro.fmtr.dev
8
+ License: Copyright © 2024 Frontmatter. All rights reserved.
9
+ Platform: UNKNOWN
10
+ Provides-Extra: augmentation
11
+ Provides-Extra: docker.api
12
+ Provides-Extra: logging
13
+ Provides-Extra: parallel
14
+ Provides-Extra: profiling
15
+ Provides-Extra: test
16
+ Provides-Extra: tokenization
17
+ Provides-Extra: yaml
18
+
19
+ UNKNOWN
20
+
@@ -0,0 +1,10 @@
1
# Optional-dependency module: Faker and sre_yield are only installed with the
# 'augmentation' extra, so import failures are converted into a helpful error.
try:
    from faker import Faker
    import sre_yield
except ImportError as exception:  # pragma: no cover
    from fmtr.tools.tools import raise_missing_extra

    raise_missing_extra('augmentation', exception)

# Shared module-level fake-data generator, built once at import time.
# NOTE(review): a single shared Faker instance — confirm thread-safety needs.
fake = Faker()
# Alias: sre_yield.AllStrings enumerates the strings matching a regex pattern.
to_generator = sre_yield.AllStrings
@@ -0,0 +1,13 @@
1
from datetime import datetime

from fmtr.tools.config_tools import ConfigClass


class ToolsConfig(ConfigClass):
    # Library-wide constants shared by the other fmtr.tools modules.
    ENCODING = 'UTF-8'  # default text encoding (used e.g. by hash_tools)
    LIBRARY_NAME = 'fmtr.tools'
    DATE_FILENAME_FORMAT = '%Y-%m-%d'  # filesystem-safe date format
    # Filesystem-safe timestamp: no colons, so it is valid on Windows too.
    DATETIME_FILENAME_FORMAT = f'{DATE_FILENAME_FORMAT}@%H-%M-%S'
    # Evaluated once at import time, so this is effectively process start time.
    # NOTE(review): datetime.utcnow() returns a *naive* timestamp and is
    # deprecated since Python 3.12 — consider datetime.now(timezone.utc),
    # after confirming no caller compares this against naive datetimes.
    DATETIME_NOW = datetime.utcnow()
    DATETIME_NOW_STR = DATETIME_NOW.strftime(DATETIME_FILENAME_FORMAT)
    SERIALIZATION_INDENT = 4  # spaces, used by the JSON/YAML serialisers
@@ -0,0 +1,54 @@
1
+ """
2
+
3
+ Config tools for downstream projects
4
+
5
+ """
6
+ from dataclasses import dataclass, fields, Field as DataclassField
7
+
8
+ from typing import List, Type
9
+
10
+
11
+ class ConfigClass:
12
+ """
13
+
14
+ Base class for all config classes.
15
+
16
+ """
17
+
18
+ @classmethod
19
+ def process_field(cls, field):
20
+ """
21
+
22
+ Post-process field
23
+
24
+ """
25
+
26
+ @classmethod
27
+ def process_fields(cls):
28
+ """
29
+
30
+ Post-process fields
31
+
32
+ """
33
+ for field in cls.get_fields():
34
+ cls.process_field(field)
35
+
36
+ @classmethod
37
+ def get_fields(cls) -> List[DataclassField]:
38
+ """
39
+
40
+ Return fields
41
+
42
+ """
43
+ return fields(cls)
44
+
45
+ def __init_subclass__(cls, **kwargs):
46
+ """
47
+
48
+ Decorate subclasses as dataclasses
49
+
50
+ """
51
+ return dataclass(cls)
52
+
53
+
54
+ Field = Type
@@ -0,0 +1,53 @@
1
+ from dataclasses import fields as get_fields_tuple
2
+
3
+ from fmtr.tools.tools import EMPTY
4
+
5
+
6
def get_fields(cls, **filters) -> dict:
    """
    Collect a dataclass's fields into a name → Field mapping.

    Keyword filters are matched against field attributes by identity
    (``is``), e.g. ``init=True``; a field is kept only when every filter
    matches.
    """
    matched = {}
    for candidate in get_fields_tuple(cls):
        keep = all(
            getattr(candidate, attr) is expected
            for attr, expected in filters.items()
        )
        if keep:
            matched[candidate.name] = candidate

    return matched
18
+
19
+
20
def get_metadata(cls, **filters) -> dict:
    """
    Map each (filtered) field name of a dataclass to its metadata mapping.

    Filters are forwarded to `get_fields` unchanged.
    """
    selected = get_fields(cls, **filters)
    return {field_name: field.metadata for field_name, field in selected.items()}
29
+
30
+
31
def get_enabled_fields(cls, name, enabled=True, default=EMPTY, **filters):
    """
    List the names of fields whose metadata entry `name` is `enabled`.

    `name` may also be a dict, in which case its first key is used. When
    `default` is the EMPTY sentinel, a missing metadata key raises
    KeyError; otherwise `default` stands in for missing entries. The
    comparison against `enabled` is by identity (``is``).
    """
    if isinstance(name, dict):
        name = next(iter(name))

    enabled_names = []
    for field_name, meta in get_metadata(cls, **filters).items():

        if default is EMPTY:
            value = meta[name]
        else:
            value = meta.get(name, default)

        if value is enabled:
            enabled_names.append(field_name)

    return enabled_names
@@ -0,0 +1,46 @@
1
+ from typing import Any
2
+
3
+ from distutils.util import strtobool
4
+
5
+ from fmtr.tools.tools import Raise
6
+
7
+
8
class TypeConversionFailed(ValueError):
    """
    Raised when a value cannot be converted to the requested type.
    """
14
+
15
+
16
def get_failure_message(raw, type_type):
    """
    Build the generic error message for a failed type conversion.
    """
    raw_type = type(raw)
    return f'Failed to convert "{raw}" (type: {raw_type}) to type {type_type}'
23
+
24
+
25
def to_bool(raw: Any, default=None) -> bool:
    """
    Convert a value to a Boolean.

    Accepts the usual truthy/falsy strings ('y'/'yes'/'t'/'true'/'on'/'1'
    and 'n'/'no'/'f'/'false'/'off'/'0', case-insensitive); any other value
    is first coerced via str(). On failure, returns `default`, or raises
    TypeConversionFailed when `default` is the Raise sentinel.

    The truth-string logic is implemented inline rather than via
    distutils.util.strtobool, which was removed with distutils in
    Python 3.12.
    """
    truthy = {'y', 'yes', 't', 'true', 'on', '1'}
    falsy = {'n', 'no', 'f', 'false', 'off', '0'}

    try:
        text = str(raw).lower()
        if text in truthy:
            return True
        if text in falsy:
            return False
        # Mirror strtobool's failure mode so the handler below applies.
        raise ValueError(f'invalid truth value {text!r}')
    except ValueError as exception:
        if default is Raise:
            msg = get_failure_message(raw, bool)
            raise TypeConversionFailed(msg) from exception
        else:
            return default
43
+
44
+
45
def is_none(value: Any) -> bool:
    """Report whether `value` is the None singleton."""
    result = value is None
    return result
@@ -0,0 +1,36 @@
1
+ import contextlib
2
+
3
+ try:
4
+ import docker
5
+ except ImportError as exception: # pragma: no cover
6
+ from fmtr.tools.tools import raise_missing_extra
7
+
8
+ raise_missing_extra('docker', exception)
9
+
10
+
11
@contextlib.contextmanager
def Container(image, ports=None, name=None, **kwargs):
    """
    Run a Docker container in a context manager.

    Any pre-existing container with the same name is stopped and removed
    first; a fresh detached container is then started and yielded, and is
    always stopped and removed on exit, even if the body raises.

    :param image: image tag to run
    :param ports: optional iterable of ports, each published host:container
    :param name: optional container name
    :param kwargs: passed through to `client.containers.run`
    """
    client = docker.from_env()

    # Clean up any leftover container of the same name. Guarded: looking up
    # a None name raises a non-NotFound error from the Docker SDK.
    if name is not None:
        try:
            stale = client.containers.get(name)
            stale.stop()
            stale.remove()
        except docker.errors.NotFound:
            pass

    # `ports or []` fixes a TypeError when `ports` is omitted (None).
    ports = {f'{port}/tcp': port for port in (ports or [])}
    container = client.containers.run(image, ports=ports, detach=True, name=name, **kwargs)

    try:
        yield container
    finally:
        container.stop()
        container.remove()
+
36
+
@@ -0,0 +1,76 @@
1
+ """
2
+
3
+ Tools for handling environment variables etc.
4
+
5
+ """
6
+ import os
7
+ from collections.abc import Callable
8
+ from datetime import date, datetime
9
+ from typing import Any, Dict
10
+
11
+ from fmtr.tools.datatype_tools import to_bool
12
+ from fmtr.tools.path_tools import Path
13
+ from fmtr.tools.tools import identity, EMPTY
14
+
15
+
16
class MissingEnvironmentVariable(KeyError):
    """
    Raised when a required environment variable has not been set.
    """
22
+
23
+
24
def get_env_dict() -> Dict[str, str]:
    """
    Snapshot the current environment variables as a plain dictionary.
    """
    return dict(os.environ)
32
+
33
+
34
def get_env(name: str, default: Any = EMPTY, converter: Callable = identity, convert_empty: bool = False) -> Any:
    """
    Read an environment variable, with default substitution and conversion.

    Raises MissingEnvironmentVariable when the variable is unset and no
    default was supplied (i.e. `default` is the EMPTY sentinel). The
    converter is applied unless the resolved value is None, or always when
    `convert_empty` is set.
    """
    value = os.getenv(name, default)

    if value is EMPTY:
        msg = f'Environment variable "{name}" is required but has not been set'
        raise MissingEnvironmentVariable(msg)

    should_convert = convert_empty or value is not None
    if should_convert:
        value = converter(value)

    return value
50
+
51
+
52
def get_env_getter(converter: Callable) -> Callable:
    """
    Build a `get_env` variant that applies a fixed type converter.
    """

    def func(name: str, default: Any = EMPTY):
        """
        Read an environment variable, converting with the captured converter.
        """
        return get_env(name, default=default, converter=converter)

    return func
69
+
70
+
71
# Typed convenience getters built from `get_env_getter`.
# int goes via float so values like "3.0" parse; truncation is toward zero.
get_env_int = get_env_getter(lambda n: int(float(n)))
get_env_float = get_env_getter(float)
get_env_bool = get_env_getter(to_bool)
get_env_date = get_env_getter(date.fromisoformat)
get_env_datetime = get_env_getter(datetime.fromisoformat)
get_env_path = get_env_getter(Path)
@@ -0,0 +1,30 @@
1
def combine_args_kwargs(args: list = None, kwargs: dict = None):
    """
    Combine positional and keyword arguments into a single structure.

    Positional arguments are keyed by their index, then keyword arguments
    are merged in. If every resulting key is an int (i.e. there were no
    keyword arguments), the result collapses back to a plain list of
    values; otherwise a dict is returned. Inverse of `split_args_kwargs`.

    Annotations fixed: `args` is a sequence, not a dict, and the return
    value may be a list, so the misleading `-> dict` annotation is dropped.
    """
    args = args or []
    kwargs = kwargs or {}

    combined = {index: value for index, value in enumerate(args)}
    combined.update(kwargs)

    if all(isinstance(key, int) for key in combined):
        return list(combined.values())

    return combined
14
+
15
+
16
def split_args_kwargs(args_kwargs) -> tuple:
    """
    Split combined arguments back into an (args, kwargs) pair.

    A list input is treated as purely positional. For a dict input, int
    keys are taken as positional arguments (in iteration order) and all
    other keys as keyword arguments. Inverse of `combine_args_kwargs`.

    Annotated with builtin `tuple`: the original annotation referenced
    `typing.Tuple`, which this module never imports, so merely defining
    the function raised NameError.
    """
    if isinstance(args_kwargs, list):
        return args_kwargs, {}

    args = [value for key, value in args_kwargs.items() if isinstance(key, int)]
    kwargs = {key: value for key, value in args_kwargs.items() if not isinstance(key, int)}

    return args, kwargs
29
+
30
+
@@ -0,0 +1,13 @@
1
+ from zlib import crc32
2
+
3
+ from fmtr.tools.config import ToolsConfig
4
+
5
+
6
def hash_unit(value: str) -> float:
    """
    Deterministically hash a value to a float in [0.0, 1.0) (not secure).
    """
    encoded = str(value).encode(ToolsConfig.ENCODING)
    checksum = crc32(encoded) & 0xffffffff
    return checksum / 2 ** 32
@@ -0,0 +1,45 @@
1
+ from itertools import chain
2
+ from typing import List, Dict, Any
3
+
4
+
5
def enlist(value) -> List[Any]:
    """
    Wrap a non-list value in a singleton list; pass lists through unchanged.
    """
    if isinstance(value, list):
        return value
    return [value]
13
+
14
+
15
+ def dict_records_to_lists(data: List[Dict[Any, Any]], missing: Any = None) -> Dict[Any, List[Any]]:
16
+ """
17
+
18
+ Convert a list of dictionaries to lists format
19
+
20
+ """
21
+ keys = set(chain.from_iterable([datum.keys() for datum in data]))
22
+ as_lists = {key: [] for key in keys}
23
+ for datum in data:
24
+ for key in keys:
25
+ as_lists[key].append(datum.get(key, missing))
26
+ return as_lists
27
+
28
+
29
def get_batch_sizes(total, num_batches):
    """
    Split `total` items into `num_batches` sizes as evenly as possible.

    The first `total % num_batches` batches receive one extra item; the
    sizes always sum to `total`.
    """
    base, remainder = divmod(total, num_batches)
    return [base + 1 if index < remainder else base for index in range(num_batches)]
36
+
37
+
38
def chunk_data(data, size: int):
    """
    Split a sequence into consecutive chunks of `size`, plus any remainder.
    """
    offsets = range(0, len(data), size)
    return [data[start:start + size] for start in offsets]
@@ -0,0 +1,23 @@
1
+ import json
2
+
3
+ from fmtr.tools.config import ToolsConfig
4
+
5
+
6
def to_json(obj):
    """
    Serialise an object to a pretty-printed, non-ASCII-escaped JSON string.
    """
    return json.dumps(obj, indent=ToolsConfig.SERIALIZATION_INDENT, ensure_ascii=False)
14
+
15
+
16
def from_json(json_str: str):
    """
    Deserialise a JSON string into Python objects.
    """
    return json.loads(json_str)
@@ -0,0 +1,64 @@
1
+ import sys
2
+
3
+ try:
4
+ from loguru import logger as logger_loguru
5
+ except ImportError as exception: # pragma: no cover
6
+ from fmtr.tools.tools import raise_missing_extra
7
+
8
+ raise_missing_extra('logging', exception)
9
+
10
+ from fmtr.tools.config import ToolsConfig
11
+ from fmtr.tools.config_tools import ConfigClass
12
+ from fmtr.tools.path_tools import Path
13
+ from fmtr.tools.environment_tools import get_env
14
+
15
+
16
class LoggingConfig(ConfigClass):
    # loguru format fragments; the <...> tags are loguru colour/style markup.
    SEP = ' '  # joins the enabled format components
    TIME = '<bold><green>{time:' + ToolsConfig.DATETIME_FILENAME_FORMAT + '}</green></bold>'
    ICON = '<level>{level.icon}</level>'
    LEVEL = '<level>{level:<8}</level>'
    FILE = '{file}:{line}'
    FUNCTION = '{function}(…)'
    MESSAGE = '{message}'
    # Default log level is read from the environment once, at import time.
    DEFAULT_LEVEL_KEY = 'FMTR_LOG_LEVEL'
    DEFAULT_LEVEL = get_env(DEFAULT_LEVEL_KEY, 'INFO')

    # Log file name stamped with the process start time.
    FILENAME = f'log-{ToolsConfig.DATETIME_NOW_STR}.log'
28
+
29
+
30
def default_filter(record):
    """Default loguru sink filter: accept every record."""
    return True
32
+
33
+
34
def default_patch(record):
    """Default loguru patcher: return the record unchanged."""
    return record
36
+
37
+
38
def get_logger(logger=logger_loguru, terminal=True, level=LoggingConfig.DEFAULT_LEVEL, time_format=LoggingConfig.TIME,
               icon_format=LoggingConfig.ICON,
               level_format=LoggingConfig.LEVEL, file_format=LoggingConfig.FILE, function_format=LoggingConfig.FUNCTION,
               message_format=LoggingConfig.MESSAGE,
               logfile=False, logfile_dir=None):
    """
    Configure and return a loguru logger.

    Joins the non-empty format components with LoggingConfig.SEP, removes
    all pre-existing sinks, then optionally adds a stderr sink (with the
    default filter and patcher applied) and/or a file sink.

    NOTE(review): the file sink is added without a `level` argument, so it
    uses loguru's default level rather than `level` — confirm intended.
    """
    components = [time_format, icon_format, level_format, file_format, function_format, message_format]
    # `format` shadows the builtin within this function; kept as-is.
    format = LoggingConfig.SEP.join([component for component in components if component])
    logger.remove()

    if terminal:
        logger.add(sys.stderr, format=format, level=level, filter=default_filter)
        logger = logger.patch(default_patch)

    if logfile:
        logfile_dir = Path(logfile_dir or '.')
        logfile_path = logfile_dir / LoggingConfig.FILENAME
        logger.add(logfile_path, format=format)

    return logger


# Shared module-level logger, configured with defaults at import time.
logger = get_logger()
@@ -0,0 +1,103 @@
1
+ from contextlib import nullcontext as NullContext
2
+ from multiprocessing import cpu_count
3
+ from typing import List, Callable, Any, Union
4
+
5
+ from fmtr.tools.config_tools import ConfigClass
6
+ from fmtr.tools.iterator_tools import dict_records_to_lists
7
+ from fmtr.tools.logging_tools import logger
8
+ from fmtr.tools.path_tools import Path
9
+
10
+ try:
11
+ import dask
12
+ import dask.bag as db
13
+ from dask.diagnostics import ProgressBar
14
+ except ImportError as exception: # pragma: no cover
15
+ from fmtr.tools.tools import raise_missing_extra
16
+
17
+ raise_missing_extra('parallel', exception)
18
+
19
+
20
class ParallelConfig(ConfigClass):
    """

    Configuration values: the dask scheduler names used by `apply_parallel`.

    """
    THREADS = 'threads'
    PROCESSES = 'processes'
    SINGLE = 'single-threaded'
29
+
30
+
31
def get_nullary_wrapper(func: Callable):
    """
    Adapt a nullary `func` to accept one ignored positional argument.

    Dask cannot map a function whose argument list is empty, so mapping is
    done over a dummy sequence and this wrapper swallows the dummy value.
    """

    def wrap_nullary(dummy: None, **kwargs):
        """
        Discard `dummy` and delegate to the wrapped function.
        """
        return func(**kwargs)

    return wrap_nullary
47
+
48
+
49
def apply_parallel(func: Callable, data: Union[List[Any], int], *args, num_workers: int = cpu_count(),
                   scheduler: str = ParallelConfig.PROCESSES,
                   parallelize: bool = True, show_progress: bool = False, return_future: bool = False, **kwargs) -> \
        List[Any]:
    """

    Helper function for a one-off, intensive parallel computation task.

    `data` selects the call style:
      - int N: `func` is treated as nullary and run N times;
      - list of tuples/lists: each inner sequence supplies positional args;
      - list of dicts: each dict supplies keyword arguments;
      - any other list: each element is passed as the single argument.

    Extra *args/**kwargs are forwarded to every call. Returns the list of
    results, or (when `return_future`) a zero-argument callable that
    computes them on demand.

    NOTE(review): `num_workers=cpu_count()` is evaluated once at import
    time, not per call.

    """

    # Forcing the single-threaded scheduler overrides any explicit choice.
    if not parallelize and scheduler != ParallelConfig.SINGLE:
        msg = f'Scheduler is set to "{scheduler}" but parallelization has been manually disabled.'
        logger.warning(msg)
        scheduler = ParallelConfig.SINGLE

    data_kwargs = {}
    if type(data) is int:  # If data is an integer, assume the function is nullary and just run it the specified number of times.
        data_args = [[None] * data]
        func = get_nullary_wrapper(func)
    else:
        data_args = []
        is_data_lists = all(isinstance(datum, (tuple, list)) for datum in data)
        is_data_dicts = all(isinstance(datum, dict) for datum in data)
        if is_data_lists:  # If the data is a list of tuples/lists of arguments.
            data_args += list(zip(*data))
        elif is_data_dicts:  # If the data is a list of dictionaries of keyword arguments.
            data_kwargs = dict_records_to_lists(data)
        else:
            data_args.append(data)  # Otherwise treat the data as a simple list of arguments.

    # Route dask's spill/temporary files to the project temp directory.
    dask.config.set({'temporary-directory': Path.temp()})

    # Wrap each argument column as a dask bag so db.map fans out element-wise.
    data_args = [db.from_sequence(value) for value in data_args]
    data_kwargs = {key: db.from_sequence(values) for key, values in data_kwargs.items()}
    future = db.map(func, *data_args, *args, **data_kwargs, **kwargs)

    def get_results():
        """

        Function to compute results with the specified configuration.

        """
        if show_progress:
            context = ProgressBar
        else:
            context = NullContext

        with context():
            return future.compute(scheduler=scheduler, num_workers=num_workers)

    if return_future:  # Return a delayed function.
        return get_results
    else:
        results = get_results()  # Compute and return results.
        return results