PyDistintoX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,347 @@
1
+ # standard library imports
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ Optional,
8
+ ParamSpec,
9
+ TypeVar,
10
+ )
11
+
12
+ # third-party imports
13
+ import numpy as np
14
+ import pandas as pd
15
+ from scipy import sparse as sp
16
+
17
+ # No application-specific imports to avoid circular imports
18
+
19
+ # intra-package imports
20
+ import pydistintox.common.config as common_config
21
+ from .config import SparseMatrix
22
+
23
def dir_is_empty(dir: Path)-> bool:
    """Return True if *dir* holds nothing except an optional '.gitkeep' placeholder."""
    remaining = {entry.name for entry in dir.iterdir()}

    # a '.gitkeep' placeholder does not count as content
    remaining.discard('.gitkeep')

    if remaining:
        logging.debug(f'{dir} contains {list(dir.iterdir())}.')
        return False
    logging.debug(f'The directory {dir} is empty (except .gitkeep)')
    return True
36
+
37
+
38
def create_pathlist(
    path:Path,
    suffix: str,
)->list[Path]:
    """
    Return a sorted list of the files in *path* whose name ends
    with the given suffix.
    """

    # warn early if the directory itself is missing
    if not path.is_dir():
        logging.error(f'Could not create pathlist from {path}.')
        logging.debug(f'using {create_pathlist.__name__}')

    matches = sorted(path.glob(f'*.{suffix}'))

    # warn if nothing matched
    if not matches:
        logging.error(f'No files with suffix {suffix} in directory {path}')
    logging.debug(f'Created pathlist: {matches}')
    return matches
60
+
61
+
62
def scan_directory(
    input_pathlist: list[Path],
)->list[tuple[str, str]]:
    """
    Read every file in *input_pathlist* and return a list of
    (file name, file content) tuples. Non-files are logged and skipped.
    """
    collected: list[tuple[str, str]] = []
    logging.debug(f'scanning dir: {input_pathlist}')

    for txt_file_path in input_pathlist:

        # skip entries that are not regular files
        if not txt_file_path.is_file():
            logging.error(f'Could not scan the following file {txt_file_path}.')
            logging.debug(f'using {scan_directory.__name__}')
            continue

        with open(txt_file_path, 'r', encoding='utf-8') as tf:
            logging.debug(f'scanning file: {txt_file_path}')
            collected.append((txt_file_path.parts[-1], tf.read()))
    return collected
85
+
86
+
87
+ """Sparse matrix functions"""
88
+ # Sparse matrices is a compressed format that saves RAM. It is not necessarily faster.
89
+ # they come in two formats:
90
+ # csr are efficient for row slicing, while csc are efficient for column slicing.
91
+ # Using csr- and csc_arrays is preferable to using csr_matrix and csc_matrix.
92
+ # csr_matrix is an old format of scipy that will not be longer maintained.
93
+
94
def dense_to_sparse(
    matrix: np.ndarray,
    format: str # 'csc' or 'csr'
)-> SparseMatrix:
    """
    Convert a dense numpy array into the requested scipy sparse format
    ('csr' or 'csc'); raise TypeError for any other format string.
    """

    converters = {
        'csr': sp.csr_array,
        'csc': sp.csc_array,
    }
    try:
        return converters[format](matrix)
    except KeyError:
        raise TypeError("Unsupported format. Use 'csr' or 'csc'.") from None
108
+
109
+
110
def save_as_sparse(
    dense_matrix: np.ndarray,
    output_dir:Path,
    file_prefix: str,
    format: str # 'csc' or 'csr'
)-> None:
    """
    Convert *dense_matrix* to the given sparse format and save it as
    <output_dir>/<file_prefix>.npz via scipy.sparse.save_npz.
    An existing file of the same name is replaced (with a warning).
    """
    # create the target directory if it does not exist yet
    output_dir.mkdir(
        parents=True,
        exist_ok=True,
    )
    out_name = f'{file_prefix}.npz'
    target = output_dir / out_name
    if target.is_file():
        logging.warning(f'File {out_name} exists already in {output_dir} and is replaced.')

    sparse_data = dense_to_sparse(
        dense_matrix,
        format,
    )
    try:
        sp.save_npz(target, sparse_data)
        logging.debug(f'Saved {str(target)} as {format}')
    except Exception as e:
        # best-effort save: log the problem instead of aborting the run
        logging.warning(f'Error saving {out_name}: {e}')
137
+
138
+
139
+
140
+ """ Logging settings """
141
+
142
def normal_logging():
    """Configure root logging for normal runs: INFO level, bare messages."""
    logging.basicConfig(
        format='%(message)s',   # show message only
        level=logging.INFO,     # INFO and above (warnings, errors)
        force=True,             # override any earlier logging configuration
    )
148
+
149
def debug_logging():
    """Configure root logging for debugging: DEBUG level, timestamped messages."""
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',  # time - level - message
        level=logging.DEBUG,    # show messages of all levels
        force=True,             # override any earlier logging configuration
    )
155
+
156
def setup_logging(debug: bool = False) -> None:
    """
    Configure root logging and third-party logger levels.

    With debug=True everything (including gensim and spacy) logs at
    DEBUG; otherwise INFO is used and gensim is capped at WARNING,
    because its INFO output is too verbose.
    """
    if debug:
        debug_logging()
        for noisy in ('gensim', 'spacy'):
            logging.getLogger(noisy).setLevel(logging.DEBUG)
    else:
        normal_logging()
        # gensim is too chatty at INFO level
        logging.getLogger('gensim').setLevel(logging.WARNING)

    name_logging_level = logging.getLevelName(logging.root.level)
    logging.info(f'Current logging level: {name_logging_level}')
    return
170
+
171
+
172
+
173
+ """ other functions """
174
+ T = TypeVar('T')
175
+ P = ParamSpec('P')
176
+
177
+ def handle_errors(
178
+ func: Callable[..., T],
179
+ *args: Any,
180
+ on_error: Optional[Callable[[Exception], Optional[T]]] = None,# only use as keyword argument!
181
+ **kwargs: Any,
182
+ )->Optional[T]:
183
+ """
184
+ Takes a function with arguments and executes it
185
+ while logging error messages. If an error occurs,
186
+ the provided `on_error` callback is executed (if given).
187
+ """
188
+
189
+ try:
190
+ return func(*args,**kwargs)
191
+ except Exception as e:
192
+ logging.error(f'Error in {func.__name__}: {type(e).__name__}: {e}')
193
+ if on_error:
194
+ on_error()
195
+ return
196
+
197
+
198
+ def intersect_lists(
199
+ one: list | None,
200
+ two: list | None,
201
+ )-> list | None:
202
+
203
+ if not one or not two:
204
+ return None
205
+ else:
206
+ return list(set(one) & set(two))
207
+
208
+ def sord_by_order(
209
+ to_be_ordered: list | set,
210
+ order: list | tuple,
211
+ )-> tuple:
212
+
213
+ assert set(to_be_ordered).issubset(set(order)), (
214
+ f'The requested ordering of {to_be_ordered}\n'
215
+ f'is not part of {order}'
216
+ )
217
+
218
+ return tuple(
219
+ item
220
+ for item in order
221
+ if item in to_be_ordered
222
+ )
223
+
224
+
225
+ def get_measure_names(
226
+ measures_available: list[str],
227
+ measures_input: list[str] | set[str] | None = None,
228
+ ) -> tuple[str, ...]:
229
+ """
230
+ check if input measures are correct
231
+ if no specific input measures are requested,
232
+ calculate all of them
233
+ """
234
+
235
+ if measures_input is None:
236
+ logging.debug(
237
+ f'All measures will be computed. That are:\n\n'
238
+ f'{measures_available}'
239
+ )
240
+ return tuple(measures_available)
241
+ else:
242
+ # bring input measures in order of measures_available
243
+ measures_in_sequence = sord_by_order(
244
+ to_be_ordered=measures_input,
245
+ order=measures_available,
246
+ )
247
+
248
+ return measures_in_sequence
249
+
250
+
251
def open_list(
    path:Path
) -> list[str]:
    """
    Read a text file and return its lines as a list of strings.

    Logs a warning (and re-raises) if the file does not exist, and
    warns about a trailing empty line.
    """

    try:
        # explicit utf-8 for consistency with the other readers in this module
        with open(path, 'r', encoding='utf-8') as f:
            terms = f.read().splitlines()

        # BUG FIX: guard against an empty file before peeking at terms[-1]
        if terms and terms[-1] == '':
            logging.warning(f'Empty string at the end of {path.name}')

        return terms

    except FileNotFoundError:
        logging.warning(
            f'{__name__}: The file {path.name} could not be found in {path}.\n'
        )
        raise
272
+
273
def load_csv(
    path: Path,
    has_header: bool = True,
) -> pd.DataFrame:
    """
    Load a CSV file using the project-wide CSV settings.

    pandas expects ``header`` to be an int giving the line number of the
    header row, or None if the file has no header.
    """
    return pd.read_csv(
        path,
        header=0 if has_header else None,
        **common_config.CSV_EXPORT_CONFIG
    )
289
+
290
+ # Outdated and not used so far
291
+ # def load_results(
292
+ # path: Path # path to a directory
293
+ # )-> dict[str,np.ndarray]:
294
+ # """
295
+ # Loads data from CSV files in the specified directory
296
+ # into a dictionary of NumPy arrays.
297
+ # """
298
+
299
+ # paths = create_pathlist(path,'csv')
300
+ # file_names = [path.name for path in paths]
301
+ # logging.debug(f'Found {file_names}')
302
+ # results = {}
303
+
304
+ # for path in paths:
305
+
306
+ # # exclude terms.csv from being read
307
+ # if path.stem=='terms':
308
+ # continue
309
+ # results[path.stem]=np.array(open_list(path))
310
+
311
+ # logging.debug(f'loaded {list(results.keys())}')
312
+
313
+ # # test the loaded arrays
314
+ # first_array = list(results.values())[0]
315
+ # length = len(first_array)
316
+
317
+ # # check if length is plausible
318
+ # assert len(first_array) > 30, f'Length of {list(results.keys())[0]} is below 30!'
319
+
320
+ # # check if all arrays are of the same length
321
+ # for key, array in results.items():
322
+ # assert (
323
+ # len(array) == length
324
+ # ), f"Array for key '{key}' has only {len(array)} elements (expected: {length})"
325
+
326
+ # return results
327
+
328
+
329
+ """ general helpers """
330
def sort_arrays_desc(
    array: np.ndarray,
) -> np.ndarray:
    """Return a copy of *array* sorted in descending order."""
    ascending = np.sort(array)
    return ascending[::-1]
335
+
336
def chunk_list(
    items: list[str],
    chunk_length: int
) -> list[list[str]]:
    """
    Split *items* into consecutive chunks of at most *chunk_length* elements.
    """
    chunks = []
    for start in range(0, len(items), chunk_length):
        chunks.append(items[start : start + chunk_length])
    return chunks
File without changes
@@ -0,0 +1,57 @@
1
+ # third-party imports
2
+ import numpy as np
3
+
4
+ # application-specific imports
5
+ from pydistintox.common.config import SparseMatrix
6
+
7
+ # intra-package imports
8
+ from .measures import (
9
+ calculate_Eta_deviation_of_proportions,
10
+ calculate_original_Zeta,
11
+ calculate_ratio_of_relative_frequencies,
12
+ calculate_Zeta_log2_transformed,
13
+ calculate_chi_squared_test,
14
+ calculate_log_likelihood_ratio_test,
15
+ calculate_welch_t_test,
16
+ calculate_wilkoxon_rank_sum_test,
17
+ )
18
+
19
+
20
+ """measures"""
21
+
22
# CAUTION: the order of this dict determines the order in which
# the measures are calculated (Python dicts preserve insertion order)
mapping_of_names_and_measure_functions = {
    'welch_t_value' : calculate_welch_t_test, # needs dense matrices
    'ranksumtest_value' : calculate_wilkoxon_rank_sum_test, # is faster with dense
    'chi_square_value' : calculate_chi_squared_test, # is faster with dense
    'eta_sg0' : calculate_Eta_deviation_of_proportions,
    'zeta_sd0' : calculate_original_Zeta,
    'rrf_dr0' : calculate_ratio_of_relative_frequencies,
    'zeta_sd2' : calculate_Zeta_log2_transformed,
    'LLR_value' : calculate_log_likelihood_ratio_test,
}

# canonical, ordered list of measure names derived from the mapping above
measures_available = list(mapping_of_names_and_measure_functions.keys())
36
+
37
+ # list of measures (not used)
38
+ # measures =[
39
+ # 'zeta_sd0',
40
+ # 'zeta_sd2',
41
+ # 'rrf_dr0',
42
+ # 'eta_sg0',
43
+ # 'welch_t_value',
44
+ # 'ranksumtest_value',
45
+ # 'chi_square_value',
46
+ # 'LLR_value'
47
+ # # 'tf_idf'
48
+ # ]
49
+
50
+ # names_of_td_matrices = [
51
+ # 'abs_tar',
52
+ # 'abs_ref',
53
+ # 'bin_tar',
54
+ # 'bin_ref',
55
+ # 'rel_tar',
56
+ # 'rel_ref',
57
+ # ]
@@ -0,0 +1,159 @@
1
+ # standard library imports
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ # third-party imports
6
+ import numpy as np
7
+ import pandas as pd
8
+ from scipy.stats import kendalltau
9
+
10
+ # application-specific imports
11
+ import pydistintox.common.config as common_config
12
+ from pydistintox.common.config import (
13
+ SparseMatrix,
14
+ Config
15
+ )
16
+ import pydistintox.common.utils as common_utils
17
+ from pydistintox.common.utils import (
18
+ handle_errors,
19
+ )
20
+
21
+ # intra-package imports
22
+ from .config import (
23
+ measures_available,
24
+ mapping_of_names_and_measure_functions,
25
+ )
26
+ from .measures import (
27
+ get_indicators,
28
+ scale_results,
29
+ )
30
+ from .utils import (
31
+ log_shapes,
32
+ )
33
+
34
+
35
def calculate_scores(
    matrices: common_config.Matrices, # containing also docprops and relative
    config: Config,
)-> dict[str, np.ndarray]:
    """
    Main calculation entry point: computes the requested distinctiveness
    measures and returns a dictionary with (a subset of) these keys:
    ['zeta_sd0', 'zeta_sd2', 'rrf_dr0',
    'eta_sg0', 'welch_t_value', 'ranksumtest_value',
    'chi_square_value', 'LLR_value', 'tf_idf']
    """

    logging.info('Starting to calculate scores...')
    logging.debug(f'Measures requested: {config.measures_to_calculate}')

    """ Preprocessing """

    # restrict the request to the measures this package can compute
    non_tf_idf_measures_to_calculate = common_utils.intersect_lists(
        config.measures_to_calculate,
        measures_available
    )
    # bring them into the canonical calculation order
    measures_in_sequence = common_utils.get_measure_names(
        measures_input=non_tf_idf_measures_to_calculate,
        measures_available=measures_available,
    )
    logging.debug(
        f'measures in sequence: \n'
        f'{measures_in_sequence}'
    )

    log_shapes(matrices)

    """ Gather data for analysis"""
    # data is bundled in one object which is passed to every measure function

    # calculate indicators (docprops / relative frequencies)
    data = common_config.CalculationData(
        matrices,
        get_indicators(matrices)
    )

    # save parameters
    data.set_params(
        segmentlength=config.segmentlength,
        logaddition=config.logaddition,
        divaddition=config.divaddition,
    )

    """ Start calculation"""
    result = {}

    for measure_name in measures_in_sequence:
        # load function for measure
        fct = mapping_of_names_and_measure_functions[measure_name]
        # execute function; handle_errors logs failures and returns None
        result_measure = handle_errors(
            fct,
            data,
        )

        # keep only successfully computed measures
        if result_measure is not None:
            result[measure_name] = result_measure
        else:
            # BUG FIX: this message was previously logged in the
            # `is not None` (success) branch, i.e. exactly when the
            # measure was NOT None
            logging.debug(f'{measure_name} is None')

    logging.debug(f'Scores calculated: \n {result.keys()}')
    # For the time being, dense matrices are necessary for welch's t-Test only.
    # However, if they are generated, Chi-squared test and Wilcoxon rank-sum test
    # use these dense matrices, since they make the computation faster.

    dense_absolute_matrices_loaded = (
        data.matrices.abs_dense_tar is not None and data.matrices.abs_dense_ref is not None
    )
    logging.debug(
        f'dense_absolute_matrices_loaded = {dense_absolute_matrices_loaded}'
    )

    # results of tf-idf measures are provided by the other package td_matrices
    not_computed = set(measures_in_sequence).difference(set(result.keys()))
    logging.debug(f"Results calculated for: {', '.join(result.keys())}")
    if not_computed:
        logging.info(f"Could not compute {', '.join(not_computed)}")

    # scaling
    if config.scaling:
        scale_results(
            result,
        )

    logging.debug(
        f'statistical measures calculated \n'
        f' {calculate_scores.__name__} finished')

    return result
131
+
132
+
133
def calculate_rank_correlation(
    results: dict[str,np.ndarray],
)-> tuple[np.ndarray, list[str]]:
    """
    Compute the pairwise Kendall rank correlation between all measures.

    Returns a symmetric (n x n) matrix of tau values plus the list of
    measure names that defines the row/column order. As the input
    arrays are uniformly scaled, only rank correlation is meaningful.
    """

    index = list(results.keys())
    size = len(index)
    matrix = np.empty((size, size))

    # fill the matrix pair by pair
    for row, key_row in enumerate(index):
        for col, key_col in enumerate(index):
            tau = kendalltau(results[key_row], results[key_col])[0]  # p-value not used
            matrix[row, col] = tau
    return matrix, index
@@ -0,0 +1,50 @@
1
+ # standard library imports
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ # third-party imports
6
+ import scipy.sparse as sp
7
+
8
+ # application-specific imports
9
+ from pydistintox.common.config import SparseMatrix
10
+
11
+
12
+ """Load functions"""
13
+
14
+ def load_sparse(
15
+ file_names:list[str],
16
+ input_dir: Path
17
+ )-> dict[str,sp.csc_array|sp.csr_array]:
18
+ """
19
+ takes a list of file names such as ['absfreq_csr.pkl',...],
20
+ and returns a dictionary with the file names as keys.
21
+ """
22
+
23
+ matrices = {}
24
+ logging.debug(
25
+ 'The following matrices will be loaded:',
26
+ file_names
27
+ )
28
+
29
+ for name in file_names:
30
+ file_path = input_dir / name
31
+ if file_path.exists():
32
+ matrix = sp.load_npz(file_path)
33
+ matrices[name.removesuffix('.npz')] = matrix
34
+ else:
35
+ logging.error(f'Warning: File {name} not found in {input_dir}')
36
+
37
+ logging.debug(
38
+ 'saved matrices in dictionary:',
39
+ list(matrices.keys())
40
+ )
41
+
42
+ return matrices
43
+
44
+
45
def load_sparse_from_path(
    path:Path
)-> SparseMatrix:
    """Load a single sparse matrix from the given .npz file path."""
    # typo fix in the log message: 'matrice' -> 'matrix'
    logging.debug(f'The following matrix is loaded: {path.name}')
    return sp.load_npz(path)