PyDistintoX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydistintox/.DS_Store ADDED
Binary file
@@ -0,0 +1,51 @@
1
# Package metadata — dunder attributes exposed for introspection and packaging.
__version__ = "unversioned"
__author__ = "Leon Glüsing and Stefan Walter-Heßbrüggen"
__author_email__ = "leongluesing@uni-muenster.de"
__license__ = "CC0-1.0"
__homepage__ = "https://www.uni-muenster.de/Wissenschaftstheorie/Forschung/PRODATPHIL/"
__description__ = (
    'Pydistinto-Lite is a lightweight, memory-efficient tool '
    'for comparing two text corpora to identify statistically '
    'distinctive words. Originally based on '
    '[Pydistinto](https://github.com/Zeta-and-Company/pydistinto)'
    ', it uses Gensim and NumPy/SciPy to handle large datasets '
    'that exceed RAM limits.'
)

# Re-export the public API from the subpackages so users can access
# everything directly from the top-level `pydistintox` namespace.
from pydistintox.common.config import (
    Config,
)
from pydistintox.common.utils import (
    save_as_sparse,
    setup_logging,
)
from pydistintox.td_matrices.core import (
    compute_td_matrices_and_measures,
)
from pydistintox.distinct_measures.core import (
    calculate_rank_correlation,
    calculate_scores,
)
from pydistintox.visualize.core import (
    format_scores,
    open_html,
    save_results,
    visualize_rank_correlations,
    visualize_score,
    write_html_index,
)

# Explicit public API; controls `from pydistintox import *`.
__all__ = [
    "save_as_sparse",
    "setup_logging",
    "Config",
    "compute_td_matrices_and_measures",
    "calculate_rank_correlation",
    "calculate_scores",
    "format_scores",
    "save_results",
    "visualize_rank_correlations",
    "visualize_score",
    "write_html_index",
    "open_html",
]
@@ -0,0 +1,4 @@
1
# Package entry point: enables `python -m pydistintox`.
from .main import main

if __name__ == '__main__':
    main()
pydistintox/cli.py ADDED
@@ -0,0 +1,258 @@
1
+ # standard library imports
2
+ import argparse
3
+ import logging
4
+ from pathlib import Path
5
+ import sys
6
+
7
+ # third-party imports
8
+
9
+ # application-specific imports
10
+ import pydistintox.common.utils as common_utils
11
+ import pydistintox.common.config as common_config
12
+
13
+ # intra-package imports
14
+
15
+
16
""" Argument handling """
def arguments() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Returns:
        argparse.Namespace with the parsed options.

    Exits:
        Via ``parser.error`` (SystemExit) when ``--model`` is missing and
        ``--example`` is not given.
    """
    # create parser
    # RawTextHelpFormatter preserves the hand-formatted example trees in the
    # help strings below; the default formatter would collapse the newlines.
    parser = argparse.ArgumentParser(
        description='Generate matrices for analysis.',
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # debug option
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Changes logging mode to verbose.'
    )

    # add package-specific parser options
    parser.add_argument(
        '--load-nlp',
        nargs='?',  # Path is optional
        type=str,
        const='__USE_DEFAULT__',  # Default path if only --load-nlp (without path) is specified
        metavar='PATH',
        help=(
            'Load NLP results from a directory (default: ./data/interim/json/). '
            'The directory must contain tar/ and ref/ subfolders. '
            'Example:\n'
            '  --load-nlp                  # Uses default path\n'
            '  --load-nlp ./custom/path/   # Uses custom path\n'
            'Expected structure:\n'
            '  ./data/interim/json/\n'
            '  ├── tar/\n'
            '  │   ├── file1.json\n'
            '  │   └── ...\n'
            '  └── ref/\n'
            '      ├── file1.json\n'
            '      └── ...\n'
        )
    )
    parser.add_argument(
        '--save-nlp',
        nargs='?',  # Path is optional
        type=str,
        const='__USE_DEFAULT__',  # Default path if only --save-nlp (without path) is specified
        metavar='PATH',
        help=(
            'Save NLP results to a directory (default: ./data/interim/json/). '
            'The program will create tar/ (target corpus) and ref/ (reference corpus) subfolders '
            'and save the processed JSON files there. '
            'These results can later be loaded for reuse with --load-nlp by specifying the same path. '
            'Example:\n'
            '  --save-nlp                  # Uses default path\n'
            '  --save-nlp ./custom/path/   # Uses custom path\n'
            'This will create the following structure:\n'
            '  ./data/interim/json/\n'
            '  ├── tar/\n'
            '  │   ├── file1.json\n'
            '  │   ├── file2.json\n'
            '  │   └── ...\n'
            '  └── ref/\n'
            '      ├── file1.json\n'
            '      ├── file2.json\n'
            '      └── ...\n\n'
            'If the directory does not exist, it will be created automatically.'
        )
    )
    parser.add_argument(
        '--example',
        action='store_true',
        help='Use this flag to use example data instead of your own.'
    )
    parser.add_argument(
        '--input-tar',
        type=str,
        help='Path to directory from which to load the data. (default is data/corp_tar)'
    )
    parser.add_argument(
        '--input-ref',
        type=str,
        help='Path to directory from which to load the data. (default is data/corp_ref)'
    )
    parser.add_argument(
        '--model',
        type=str,
        help=(
            "spaCy model name format: '{lang}_core_{dataset}_{size}'"
            "Example: 'en_core_web_sm' (lang=en, dataset=web, size=sm)\n"
            "Available sizes: sm, md, lg, trf"
            "Find models: https://spacy.io/models"
        )
    )
    parser.add_argument(
        '--raw-scores',
        action='store_true',
        help='Scores will not be scaled to -1,1.'
    )

    # parse given arguments and return them
    args = parser.parse_args()

    # checks: --model is mandatory unless the bundled example data is used
    # (the example pipeline falls back to 'en_core_web_sm' downstream).
    if not args.example and not args.model:
        parser.error('--model is required unless --example is specified')

    return args
122
+
123
+
124
def handle_input(args: argparse.Namespace) -> common_config.Config:
    """Resolves input paths based on CLI arguments and returns a structured config.

    Precedence: --load-nlp (skip NLP) > --example > explicit --input-tar/--input-ref
    > default corpus directories.

    Args:
        args: parsed CLI arguments from ``arguments()``.

    Returns:
        A fully populated ``common_config.Config``.

    Exits:
        ``sys.exit(1)`` when a required input directory is empty.
        NOTE(review): several validations below use ``assert``, which is
        stripped under ``python -O``.
    """

    # local function
    def _validate_paths(
        target: Path,
        reference: Path,
        error_message: str
    ) -> None:
        """Validates that directories exist and are not empty."""
        if common_utils.dir_is_empty(target):
            logging.error(
                f"Directory '{target}' is empty. {error_message}"
            )
            sys.exit(1)
        if common_utils.dir_is_empty(reference):
            logging.error(
                f"Directory '{reference}' is empty. {error_message}"
            )
            sys.exit(1)

    # local function
    def _log_paths(
        config: common_config.Config,
        message: str
    ) -> None:
        """Logs the paths being used."""
        logging.info(
            f'{message}\n'
            f'Target dir: {config.target}\n'
            f'Reference dir: {config.reference}'
        )

    # --- Case 1: Load NLP results ---
    if args.load_nlp is not None:
        # --load-nlp and --save-nlp are mutually exclusive.
        assert args.save_nlp is None, (
            "Error: '--load-nlp' and '--save-nlp' have been raised."
        )
        logging.warning(
            'CAUTION: To ensure all annotations and token mappings are loaded correctly, '
            'use the same spaCy model (and version) that was used to store the data.'
        )
        # '__USE_DEFAULT__' is the sentinel set by argparse's `const` when the
        # flag is given without a path.
        INPUT_JSON = (
            common_config.JSON if args.load_nlp == '__USE_DEFAULT__'
            else Path(args.load_nlp)
        )
        config = common_config.Config(
            skip_nlp=True,
            save_nlp=False,
            target=INPUT_JSON / 'tar',
            reference=INPUT_JSON / 'ref',
            source='load-nlp',
            spacy_model_name=args.model,
        )
        _validate_paths(
            config.target,
            config.reference,
            error_message=(
                "The '--load-nlp' flag was set, "
                "but no corpus files were found."
            )
        )
        _log_paths(
            config,
            'The following paths will be used to load JSON:'
        )

    # --- Case 2: nlp necessary ---
    else:
        # --- Case 2.1: example flag is raised ---
        if bool(args.example):
            logging.debug('Example flag was raised')
            assert not args.input_tar and not args.input_ref, (
                'Cannot use --example and custom input paths simultaneously.'
            )
            assert not (
                common_utils.dir_is_empty(common_config.EXAMPLE_TAR)
                or common_utils.dir_is_empty(common_config.EXAMPLE_REF)
            ), (
                'Example directories are empty or do not exist.'
            )
            # Example data is English, so the model is pinned regardless of --model.
            config = common_config.Config(
                skip_nlp=False,
                target=common_config.EXAMPLE_TAR,
                reference=common_config.EXAMPLE_REF,
                source='example',
                spacy_model_name='en_core_web_sm',
            )
        # --- Case 2.2: input paths have been specified --
        elif bool(args.input_tar) or bool(args.input_ref):
            # Both paths must be given together.
            assert bool(args.input_tar) == bool(args.input_ref), (
                'Only one input path was specified!\n'
                'If you want to specify input paths, '
                'please specify both target and reference directories.'
            )
            config = common_config.Config(
                skip_nlp=False,
                target=Path(args.input_tar),
                reference=Path(args.input_ref),
                source='external',
                spacy_model_name=args.model,
            )
            # NOTE(review): unlike the other branches, user-supplied paths are
            # not checked via _validate_paths here — confirm this is intended.
            _log_paths(
                config,
                'The following paths have been given:\n\n'
            )
        else:
            logging.info('No input paths specified. Using default corpus directories.')
            config = common_config.Config(
                skip_nlp=False,
                target=common_config.CORP_TAR,
                reference=common_config.CORP_REF,
                source='default',
                spacy_model_name=args.model,
            )
            _validate_paths(
                config.target,
                config.reference,
                'Default corpus directories should not be empty.'
            )
            _log_paths(
                config,
                'Default paths are used; no input paths have been specified.'
            )

    # --- Parameters independent of nlp ---
    config.debug = bool(args.debug)
    # save_nlp is False (disabled) or the Path to write the NLP JSON to.
    config.save_nlp = (
        False if args.save_nlp is None
        else common_config.JSON if args.save_nlp == '__USE_DEFAULT__'
        else Path(args.save_nlp)
    )
    config.scaling = not(args.raw_scores)

    return config
@@ -0,0 +1 @@
1
+
File without changes
@@ -0,0 +1,155 @@
1
# standard library imports
import csv
from dataclasses import (
    dataclass,
    field
)
import logging
from pathlib import Path
from typing import TypeAlias

# third-party imports
import numpy as np
from scipy import sparse as sp


""" Paths """

# general
# NOTE(review): four .parent hops assume this module lives at
# <root>/src/pydistintox/common/config.py — confirm against the package layout.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent.parent
SRC = PROJECT_ROOT / 'src'

# data (intermediate artifacts and results live under <root>/data/)
DATA = PROJECT_ROOT / 'data'
INTERIM = DATA / 'interim'
JSON = INTERIM / 'json'
JSON_TAR = JSON / 'tar'
JSON_REF = JSON / 'ref'
TD_MATRICES = INTERIM / 'td_matrices'
STAGING = INTERIM / 'staging'
RESULTS = DATA / 'results'
RESULTS_SCORES = RESULTS / 'scores'
RESULTS_CORRELATIONS = RESULTS / 'correlations_of_measures'
SCORES_EXAMPLE = RESULTS_SCORES / 'example'
CORRELATIONS_EXAMPLE = RESULTS_CORRELATIONS / 'example'

# texts (user-supplied target and reference corpora)
TEXTS = DATA / 'texts'
CORP_TAR = TEXTS / 'corp_tar'
CORP_REF = TEXTS / 'corp_ref'

# example texts (bundled demo corpora)
EXAMPLE_TEXTS = TEXTS / 'example_texts'
EXAMPLE_TAR = EXAMPLE_TEXTS / 'corp_tar'
EXAMPLE_REF = EXAMPLE_TEXTS / 'corp_ref'
EXAMPLE_JSON = JSON / 'example'
EXAMPLE_JSON_TAR = EXAMPLE_JSON / 'tar'
EXAMPLE_JSON_REF = EXAMPLE_JSON / 'ref'

# tests
TESTS = PROJECT_ROOT / 'tests'

""" Type declaration """
# Either a compressed-sparse-column or compressed-sparse-row SciPy array.
SparseMatrix: TypeAlias = sp.csc_array | sp.csr_array
54
+
55
+
56
+ @dataclass
57
+ class Config:
58
+ """Configuration for NLP input paths and processing flags."""
59
+ skip_nlp: bool
60
+ target: Path
61
+ reference: Path
62
+ spacy_model_name: str # = 'en_core_web_sm'
63
+ source: str = 'Not specified' # only for logging / documentation
64
+ save_nlp: bool | Path = False
65
+ debug: bool = False
66
+ measures_to_calculate: list[str] | None = None
67
+ scaling: bool = True # scale scores to -1,1
68
+ segmentlength: int = 5000
69
+ # the following parameters are relevant to non-tf-idf measures
70
+ logaddition: np.float64 = np.float64(1 + 1e-11)
71
+ # logaddition is a normalization factor;
72
+ # we want log(x + logaddition) > 0,
73
+ # hence logaddition > 1 for x near 0.
74
+ divaddition: np.float64 = np.float64(1e-11)
75
+ # Define division-by-zero avoidance
76
+ # and convert input into np.float
77
+ def __post_init__(self):
78
+ from pydistintox.td_matrices.config import measures_available as tf_idf_measures
79
+ from pydistintox.distinct_measures.config import measures_available as non_tf_idf_measures
80
+ measures_available = set(tf_idf_measures + non_tf_idf_measures)
81
+ if self.measures_to_calculate:
82
+ assert set(self.measures_to_calculate).issubset(measures_available), (
83
+ f'The requested measures are not available. Please check for typos.\n'
84
+ f'Your input:\n\n'
85
+ f'{self.measures_to_calculate}\n\n'
86
+ f'Possible measures:\n\n'
87
+ f'{measures_available}'
88
+ )
89
+
90
@dataclass
class Matrices:
    """Document-term matrices for both corpora (tar = target, ref = reference)."""
    # sparse dt-matrices
    # NOTE(review): presumably bin_* = binary presence/absence,
    # rel_* = relative frequencies, abs_* = absolute counts — confirm
    # against td_matrices.core where these are built.
    bin_tar: SparseMatrix
    bin_ref: SparseMatrix
    rel_tar: SparseMatrix
    rel_ref: SparseMatrix
    abs_tar: SparseMatrix
    abs_ref: SparseMatrix

    # dense dt-matrices (not constructor arguments; populated later,
    # hence init=False with None default)
    abs_dense_tar: np.ndarray | None = field(init=False, default=None)
    abs_dense_ref: np.ndarray | None = field(init=False, default=None)
103
+
104
@dataclass
class Indicators:
    """Per-term indicator vectors for both corpora (tar = target, ref = reference)."""
    # NOTE(review): presumably docprops_* = document proportions and
    # relative_* = relative frequencies — confirm against the code that
    # constructs this dataclass.
    docprops_tar: np.ndarray
    docprops_ref: np.ndarray
    relative_tar: np.ndarray
    relative_ref: np.ndarray
110
+
111
@dataclass
class CalculationData:
    """Bundles the matrices and indicator vectors used by the measure calculations.

    The numeric parameters are not constructor arguments; they are attached
    afterwards via ``set_params`` (hence init=False with None defaults).
    """
    matrices: Matrices
    indicators: Indicators
    # Parameters added later
    segmentlength: int | None = field(init=False, default=None)
    logaddition: np.float64 | None = field(init=False, default=None)
    divaddition: np.float64 | None = field(init=False, default=None)

    def set_params(
        self,
        logaddition:np.float64,
        segmentlength:int,
        divaddition:np.float64,
    ) -> None:
        """Attach the calculation parameters and log them.

        Args:
            logaddition: additive constant used inside log() calls.
            segmentlength: number of tokens per segment.
            divaddition: additive constant avoiding division by zero.
        """
        self.segmentlength = segmentlength
        self.logaddition = logaddition
        self.divaddition = divaddition

        logging.info(
            f'The following parameters are used:\n\n'
            f'segmentlength: {segmentlength}\n'
            f'logaddition: {logaddition}\n'
            f'divaddition: {divaddition}\n'
        )



    # def __post_init__(self) -> None:
    #     """Optional: Validate data after initialization."""
    #     required_sparse = {'tar', 'ref'}
    #     if not required_sparse.issubset(self.sparse_matrices.keys()):
    #         raise ValueError(f"Missing sparse matrices. Expected: {required_sparse}")
148
# Keyword arguments for CSV export.
# NOTE(review): the key names ('sep', 'encoding') match pandas
# DataFrame.to_csv parameters rather than the stdlib csv module —
# presumably this dict is unpacked into a to_csv call; confirm at the
# call site.
CSV_EXPORT_CONFIG = {
    "quoting": csv.QUOTE_NONNUMERIC,  # quote every non-numeric field
    "quotechar": '"',
    "doublequote": True,  # embedded quotes are doubled, not escaped
    "escapechar": '\\',
    "sep": ',',  # Default separator
    "encoding": 'utf-8',  # Default encoding
}
+ }