PyDistintoX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydistintox/.DS_Store ADDED
Binary file
@@ -0,0 +1,51 @@
1
# Package metadata — dunder attributes exposed for introspection and packaging.
__version__ = "unversioned"
__author__ = "Leon Glüsing and Stefan Walter-Heßbrüggen"
__author_email__ = "leongluesing@uni-muenster.de"
__license__ = "CC0-1.0"
__homepage__ = "https://www.uni-muenster.de/Wissenschaftstheorie/Forschung/PRODATPHIL/"
__description__ = (
    'Pydistinto-Lite is a lightweight, memory-efficient tool '
    'for comparing two text corpora to identify statistically '
    'distinctive words. Originally based on '
    '[Pydistinto](https://github.com/Zeta-and-Company/pydistinto)'
    ', it uses Gensim and NumPy/SciPy to handle large datasets '
    'that exceed RAM limits.'
)

# Re-export the public API from the subpackages so users can access
# everything directly from the top-level `pydistintox` namespace.
from pydistintox.common.config import (
    Config,
)
from pydistintox.common.utils import (
    save_as_sparse,
    setup_logging,
)
from pydistintox.td_matrices.core import (
    compute_td_matrices_and_measures,
)
from pydistintox.distinct_measures.core import (
    calculate_rank_correlation,
    calculate_scores,
)
from pydistintox.visualize.core import (
    format_scores,
    open_html,
    save_results,
    visualize_rank_correlations,
    visualize_score,
    write_html_index,
)

# Explicit public API; controls `from pydistintox import *`.
__all__ = [
    "save_as_sparse",
    "setup_logging",
    "Config",
    "compute_td_matrices_and_measures",
    "calculate_rank_correlation",
    "calculate_scores",
    "format_scores",
    "save_results",
    "visualize_rank_correlations",
    "visualize_score",
    "write_html_index",
    "open_html",
]
@@ -0,0 +1,4 @@
1
# Package entry point: enables `python -m pydistintox`.
from .main import main

if __name__ == '__main__':
    main()
pydistintox/cli.py ADDED
@@ -0,0 +1,258 @@
1
+ # standard library imports
2
+ import argparse
3
+ import logging
4
+ from pathlib import Path
5
+ import sys
6
+
7
+ # third-party imports
8
+
9
+ # application-specific imports
10
+ import pydistintox.common.utils as common_utils
11
+ import pydistintox.common.config as common_config
12
+
13
+ # intra-package imports
14
+
15
+
16
""" Argument handling """
def arguments() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Returns:
        argparse.Namespace with the parsed options.

    Exits:
        Via ``parser.error`` (SystemExit) when ``--model`` is missing and
        ``--example`` is not given.
    """
    # create parser
    # RawTextHelpFormatter preserves the hand-formatted example trees in the
    # help strings below; the default formatter would collapse the newlines.
    parser = argparse.ArgumentParser(
        description='Generate matrices for analysis.',
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # debug option
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Changes logging mode to verbose.'
    )

    # add package-specific parser options
    parser.add_argument(
        '--load-nlp',
        nargs='?',  # Path is optional
        type=str,
        const='__USE_DEFAULT__',  # Default path if only --load-nlp (without path) is specified
        metavar='PATH',
        help=(
            'Load NLP results from a directory (default: ./data/interim/json/). '
            'The directory must contain tar/ and ref/ subfolders. '
            'Example:\n'
            '  --load-nlp                  # Uses default path\n'
            '  --load-nlp ./custom/path/   # Uses custom path\n'
            'Expected structure:\n'
            '  ./data/interim/json/\n'
            '  ├── tar/\n'
            '  │   ├── file1.json\n'
            '  │   └── ...\n'
            '  └── ref/\n'
            '      ├── file1.json\n'
            '      └── ...\n'
        )
    )
    parser.add_argument(
        '--save-nlp',
        nargs='?',  # Path is optional
        type=str,
        const='__USE_DEFAULT__',  # Default path if only --save-nlp (without path) is specified
        metavar='PATH',
        help=(
            'Save NLP results to a directory (default: ./data/interim/json/). '
            'The program will create tar/ (target corpus) and ref/ (reference corpus) subfolders '
            'and save the processed JSON files there. '
            'These results can later be loaded for reuse with --load-nlp by specifying the same path. '
            'Example:\n'
            '  --save-nlp                  # Uses default path\n'
            '  --save-nlp ./custom/path/   # Uses custom path\n'
            'This will create the following structure:\n'
            '  ./data/interim/json/\n'
            '  ├── tar/\n'
            '  │   ├── file1.json\n'
            '  │   ├── file2.json\n'
            '  │   └── ...\n'
            '  └── ref/\n'
            '      ├── file1.json\n'
            '      ├── file2.json\n'
            '      └── ...\n\n'
            'If the directory does not exist, it will be created automatically.'
        )
    )
    parser.add_argument(
        '--example',
        action='store_true',
        help='Use this flag to use example data instead of your own.'
    )
    parser.add_argument(
        '--input-tar',
        type=str,
        help='Path to directory from which to load the data. (default is data/corp_tar)'
    )
    parser.add_argument(
        '--input-ref',
        type=str,
        help='Path to directory from which to load the data. (default is data/corp_ref)'
    )
    parser.add_argument(
        '--model',
        type=str,
        help=(
            "spaCy model name format: '{lang}_core_{dataset}_{size}'"
            "Example: 'en_core_web_sm' (lang=en, dataset=web, size=sm)\n"
            "Available sizes: sm, md, lg, trf"
            "Find models: https://spacy.io/models"
        )
    )
    parser.add_argument(
        '--raw-scores',
        action='store_true',
        help='Scores will not be scaled to -1,1.'
    )

    # parse given arguments and return them
    args = parser.parse_args()

    # checks: --model is mandatory unless the bundled example data is used
    # (the example pipeline falls back to 'en_core_web_sm' downstream).
    if not args.example and not args.model:
        parser.error('--model is required unless --example is specified')

    return args
122
+
123
+
124
def handle_input(args: argparse.Namespace) -> common_config.Config:
    """Resolves input paths based on CLI arguments and returns a structured config.

    Precedence: --load-nlp (skip NLP) > --example > explicit --input-tar/--input-ref
    > default corpus directories.

    Args:
        args: parsed CLI arguments from ``arguments()``.

    Returns:
        A fully populated ``common_config.Config``.

    Exits:
        ``sys.exit(1)`` when a required input directory is empty.
        NOTE(review): several validations below use ``assert``, which is
        stripped under ``python -O``.
    """

    # local function
    def _validate_paths(
        target: Path,
        reference: Path,
        error_message: str
    ) -> None:
        """Validates that directories exist and are not empty."""
        if common_utils.dir_is_empty(target):
            logging.error(
                f"Directory '{target}' is empty. {error_message}"
            )
            sys.exit(1)
        if common_utils.dir_is_empty(reference):
            logging.error(
                f"Directory '{reference}' is empty. {error_message}"
            )
            sys.exit(1)

    # local function
    def _log_paths(
        config: common_config.Config,
        message: str
    ) -> None:
        """Logs the paths being used."""
        logging.info(
            f'{message}\n'
            f'Target dir: {config.target}\n'
            f'Reference dir: {config.reference}'
        )

    # --- Case 1: Load NLP results ---
    if args.load_nlp is not None:
        # --load-nlp and --save-nlp are mutually exclusive.
        assert args.save_nlp is None, (
            "Error: '--load-nlp' and '--save-nlp' have been raised."
        )
        logging.warning(
            'CAUTION: To ensure all annotations and token mappings are loaded correctly, '
            'use the same spaCy model (and version) that was used to store the data.'
        )
        # '__USE_DEFAULT__' is the sentinel set by argparse's `const` when the
        # flag is given without a path.
        INPUT_JSON = (
            common_config.JSON if args.load_nlp == '__USE_DEFAULT__'
            else Path(args.load_nlp)
        )
        config = common_config.Config(
            skip_nlp=True,
            save_nlp=False,
            target=INPUT_JSON / 'tar',
            reference=INPUT_JSON / 'ref',
            source='load-nlp',
            spacy_model_name=args.model,
        )
        _validate_paths(
            config.target,
            config.reference,
            error_message=(
                "The '--load-nlp' flag was set, "
                "but no corpus files were found."
            )
        )
        _log_paths(
            config,
            'The following paths will be used to load JSON:'
        )

    # --- Case 2: nlp necessary ---
    else:
        # --- Case 2.1: example flag is raised ---
        if bool(args.example):
            logging.debug('Example flag was raised')
            assert not args.input_tar and not args.input_ref, (
                'Cannot use --example and custom input paths simultaneously.'
            )
            assert not (
                common_utils.dir_is_empty(common_config.EXAMPLE_TAR)
                or common_utils.dir_is_empty(common_config.EXAMPLE_REF)
            ), (
                'Example directories are empty or do not exist.'
            )
            # Example data is English, so the model is pinned regardless of --model.
            config = common_config.Config(
                skip_nlp=False,
                target=common_config.EXAMPLE_TAR,
                reference=common_config.EXAMPLE_REF,
                source='example',
                spacy_model_name='en_core_web_sm',
            )
        # --- Case 2.2: input paths have been specified --
        elif bool(args.input_tar) or bool(args.input_ref):
            # Both paths must be given together.
            assert bool(args.input_tar) == bool(args.input_ref), (
                'Only one input path was specified!\n'
                'If you want to specify input paths, '
                'please specify both target and reference directories.'
            )
            config = common_config.Config(
                skip_nlp=False,
                target=Path(args.input_tar),
                reference=Path(args.input_ref),
                source='external',
                spacy_model_name=args.model,
            )
            # NOTE(review): unlike the other branches, user-supplied paths are
            # not checked via _validate_paths here — confirm this is intended.
            _log_paths(
                config,
                'The following paths have been given:\n\n'
            )
        else:
            logging.info('No input paths specified. Using default corpus directories.')
            config = common_config.Config(
                skip_nlp=False,
                target=common_config.CORP_TAR,
                reference=common_config.CORP_REF,
                source='default',
                spacy_model_name=args.model,
            )
            _validate_paths(
                config.target,
                config.reference,
                'Default corpus directories should not be empty.'
            )
            _log_paths(
                config,
                'Default paths are used; no input paths have been specified.'
            )

    # --- Parameters independent of nlp ---
    config.debug = bool(args.debug)
    # save_nlp is False (disabled) or the Path to write the NLP JSON to.
    config.save_nlp = (
        False if args.save_nlp is None
        else common_config.JSON if args.save_nlp == '__USE_DEFAULT__'
        else Path(args.save_nlp)
    )
    config.scaling = not(args.raw_scores)

    return config
@@ -0,0 +1 @@
1
+
File without changes
@@ -0,0 +1,155 @@
1
# standard library imports
import csv
from dataclasses import (
    dataclass,
    field
)
import logging
from pathlib import Path
from typing import TypeAlias

# third-party imports
import numpy as np
from scipy import sparse as sp


""" Paths """

# general
# NOTE(review): four .parent hops assume this module lives at
# <root>/src/pydistintox/common/config.py — confirm against the package layout.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent.parent
SRC = PROJECT_ROOT / 'src'

# data (intermediate artifacts and results live under <root>/data/)
DATA = PROJECT_ROOT / 'data'
INTERIM = DATA / 'interim'
JSON = INTERIM / 'json'
JSON_TAR = JSON / 'tar'
JSON_REF = JSON / 'ref'
TD_MATRICES = INTERIM / 'td_matrices'
STAGING = INTERIM / 'staging'
RESULTS = DATA / 'results'
RESULTS_SCORES = RESULTS / 'scores'
RESULTS_CORRELATIONS = RESULTS / 'correlations_of_measures'
SCORES_EXAMPLE = RESULTS_SCORES / 'example'
CORRELATIONS_EXAMPLE = RESULTS_CORRELATIONS / 'example'

# texts (user-supplied target and reference corpora)
TEXTS = DATA / 'texts'
CORP_TAR = TEXTS / 'corp_tar'
CORP_REF = TEXTS / 'corp_ref'

# example texts (bundled demo corpora)
EXAMPLE_TEXTS = TEXTS / 'example_texts'
EXAMPLE_TAR = EXAMPLE_TEXTS / 'corp_tar'
EXAMPLE_REF = EXAMPLE_TEXTS / 'corp_ref'
EXAMPLE_JSON = JSON / 'example'
EXAMPLE_JSON_TAR = EXAMPLE_JSON / 'tar'
EXAMPLE_JSON_REF = EXAMPLE_JSON / 'ref'

# tests
TESTS = PROJECT_ROOT / 'tests'

""" Type declaration """
# Either a compressed-sparse-column or compressed-sparse-row SciPy array.
SparseMatrix: TypeAlias = sp.csc_array | sp.csr_array
54
+
55
+
56
+ @dataclass
57
+ class Config:
58
+ """Configuration for NLP input paths and processing flags."""
59
+ skip_nlp: bool
60
+ target: Path
61
+ reference: Path
62
+ spacy_model_name: str # = 'en_core_web_sm'
63
+ source: str = 'Not specified' # only for logging / documentation
64
+ save_nlp: bool | Path = False
65
+ debug: bool = False
66
+ measures_to_calculate: list[str] | None = None
67
+ scaling: bool = True # scale scores to -1,1
68
+ segmentlength: int = 5000
69
+ # the following parameters are relevant to non-tf-idf measures
70
+ logaddition: np.float64 = np.float64(1 + 1e-11)
71
+ # logaddition is a normalization factor;
72
+ # we want log(x + logaddition) > 0,
73
+ # hence logaddition > 1 for x near 0.
74
+ divaddition: np.float64 = np.float64(1e-11)
75
+ # Define division-by-zero avoidance
76
+ # and convert input into np.float
77
+ def __post_init__(self):
78
+ from pydistintox.td_matrices.config import measures_available as tf_idf_measures
79
+ from pydistintox.distinct_measures.config import measures_available as non_tf_idf_measures
80
+ measures_available = set(tf_idf_measures + non_tf_idf_measures)
81
+ if self.measures_to_calculate:
82
+ assert set(self.measures_to_calculate).issubset(measures_available), (
83
+ f'The requested measures are not available. Please check for typos.\n'
84
+ f'Your input:\n\n'
85
+ f'{self.measures_to_calculate}\n\n'
86
+ f'Possible measures:\n\n'
87
+ f'{measures_available}'
88
+ )
89
+
90
@dataclass
class Matrices:
    """Document-term matrices for both corpora (tar = target, ref = reference)."""
    # sparse dt-matrices
    # NOTE(review): presumably bin_* = binary presence/absence,
    # rel_* = relative frequencies, abs_* = absolute counts — confirm
    # against td_matrices.core where these are built.
    bin_tar: SparseMatrix
    bin_ref: SparseMatrix
    rel_tar: SparseMatrix
    rel_ref: SparseMatrix
    abs_tar: SparseMatrix
    abs_ref: SparseMatrix

    # dense dt-matrices (not constructor arguments; populated later,
    # hence init=False with None default)
    abs_dense_tar: np.ndarray | None = field(init=False, default=None)
    abs_dense_ref: np.ndarray | None = field(init=False, default=None)
103
+
104
@dataclass
class Indicators:
    """Per-term indicator vectors for both corpora (tar = target, ref = reference)."""
    # NOTE(review): presumably docprops_* = document proportions and
    # relative_* = relative frequencies — confirm against the code that
    # constructs this dataclass.
    docprops_tar: np.ndarray
    docprops_ref: np.ndarray
    relative_tar: np.ndarray
    relative_ref: np.ndarray
110
+
111
@dataclass
class CalculationData:
    """Bundles the matrices and indicator vectors used by the measure calculations.

    The numeric parameters are not constructor arguments; they are attached
    afterwards via ``set_params`` (hence init=False with None defaults).
    """
    matrices: Matrices
    indicators: Indicators
    # Parameters added later
    segmentlength: int | None = field(init=False, default=None)
    logaddition: np.float64 | None = field(init=False, default=None)
    divaddition: np.float64 | None = field(init=False, default=None)

    def set_params(
        self,
        logaddition:np.float64,
        segmentlength:int,
        divaddition:np.float64,
    ) -> None:
        """Attach the calculation parameters and log them.

        Args:
            logaddition: additive constant used inside log() calls.
            segmentlength: number of tokens per segment.
            divaddition: additive constant avoiding division by zero.
        """
        self.segmentlength = segmentlength
        self.logaddition = logaddition
        self.divaddition = divaddition

        logging.info(
            f'The following parameters are used:\n\n'
            f'segmentlength: {segmentlength}\n'
            f'logaddition: {logaddition}\n'
            f'divaddition: {divaddition}\n'
        )



    # def __post_init__(self) -> None:
    #     """Optional: Validate data after initialization."""
    #     required_sparse = {'tar', 'ref'}
    #     if not required_sparse.issubset(self.sparse_matrices.keys()):
    #         raise ValueError(f"Missing sparse matrices. Expected: {required_sparse}")
148
# Keyword arguments for CSV export.
# NOTE(review): the key names ('sep', 'encoding') match pandas
# DataFrame.to_csv parameters rather than the stdlib csv module —
# presumably this dict is unpacked into a to_csv call; confirm at the
# call site.
CSV_EXPORT_CONFIG = {
    "quoting": csv.QUOTE_NONNUMERIC,  # quote every non-numeric field
    "quotechar": '"',
    "doublequote": True,  # embedded quotes are doubled, not escaped
    "escapechar": '\\',
    "sep": ',',  # Default separator
    "encoding": 'utf-8',  # Default encoding
}
+ }