PyDistintoX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,347 @@
1
+ # standard library imports
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ Optional,
8
+ ParamSpec,
9
+ TypeVar,
10
+ )
11
+
12
+ # third-party imports
13
+ import numpy as np
14
+ import pandas as pd
15
+ from scipy import sparse as sp
16
+
17
+ # No application-specific imports to avoid circular imports
18
+
19
+ # intra-package imports
20
+ import pydistintox.common.config as common_config
21
+ from .config import SparseMatrix
22
+
23
def dir_is_empty(dir: Path)-> bool:
    """Return True if *dir* holds nothing except an optional '.gitkeep' placeholder."""
    remaining = {entry.name for entry in dir.iterdir()}

    # a '.gitkeep' placeholder does not count as content
    remaining.discard('.gitkeep')

    if remaining:
        logging.debug(f'{dir} contains {list(dir.iterdir())}.')
        return False
    logging.debug(f'The directory {dir} is empty (except .gitkeep)')
    return True
36
+
37
+
38
def create_pathlist(
    path:Path,
    suffix: str,
)->list[Path]:
    """
    Return a sorted list of the files in *path* whose name ends
    with the given suffix.
    """

    # warn early if the directory itself is missing
    if not path.is_dir():
        logging.error(f'Could not create pathlist from {path}.')
        logging.debug(f'using {create_pathlist.__name__}')

    matches = sorted(path.glob(f'*.{suffix}'))

    # warn if nothing matched
    if not matches:
        logging.error(f'No files with suffix {suffix} in directory {path}')
    logging.debug(f'Created pathlist: {matches}')
    return matches
60
+
61
+
62
def scan_directory(
    input_pathlist: list[Path],
)->list[tuple[str, str]]:
    """
    Read every file in *input_pathlist* and return a list of
    (file name, file content) tuples. Non-files are logged and skipped.
    """
    collected: list[tuple[str, str]] = []
    logging.debug(f'scanning dir: {input_pathlist}')

    for txt_file_path in input_pathlist:

        # skip entries that are not regular files
        if not txt_file_path.is_file():
            logging.error(f'Could not scan the following file {txt_file_path}.')
            logging.debug(f'using {scan_directory.__name__}')
            continue

        with open(txt_file_path, 'r', encoding='utf-8') as tf:
            logging.debug(f'scanning file: {txt_file_path}')
            collected.append((txt_file_path.parts[-1], tf.read()))
    return collected
85
+
86
+
87
+ """Sparse matrix functions"""
88
+ # Sparse matrices is a compressed format that saves RAM. It is not necessarily faster.
89
+ # they come in two formats:
90
+ # csr are efficient for row slicing, while csc are efficient for column slicing.
91
+ # Using csr- and csc_arrays is preferable to using csr_matrix and csc_matrix.
92
+ # csr_matrix is an old format of scipy that will not be longer maintained.
93
+
94
def dense_to_sparse(
    matrix: np.ndarray,
    format: str # 'csc' or 'csr'
)-> SparseMatrix:
    """
    Convert a dense numpy array into the requested scipy sparse format
    ('csr' or 'csc'); raise TypeError for any other format string.
    """

    converters = {
        'csr': sp.csr_array,
        'csc': sp.csc_array,
    }
    try:
        return converters[format](matrix)
    except KeyError:
        raise TypeError("Unsupported format. Use 'csr' or 'csc'.") from None
108
+
109
+
110
def save_as_sparse(
    dense_matrix: np.ndarray,
    output_dir:Path,
    file_prefix: str,
    format: str # 'csc' or 'csr'
)-> None:
    """
    Convert *dense_matrix* to the given sparse format and save it as
    <output_dir>/<file_prefix>.npz via scipy.sparse.save_npz.
    An existing file of the same name is replaced (with a warning).
    """
    # create the target directory if it does not exist yet
    output_dir.mkdir(
        parents=True,
        exist_ok=True,
    )
    out_name = f'{file_prefix}.npz'
    target = output_dir / out_name
    if target.is_file():
        logging.warning(f'File {out_name} exists already in {output_dir} and is replaced.')

    sparse_data = dense_to_sparse(
        dense_matrix,
        format,
    )
    try:
        sp.save_npz(target, sparse_data)
        logging.debug(f'Saved {str(target)} as {format}')
    except Exception as e:
        # best-effort save: log the problem instead of aborting the run
        logging.warning(f'Error saving {out_name}: {e}')
137
+
138
+
139
+
140
+ """ Logging settings """
141
+
142
def normal_logging():
    """Configure root logging for normal runs: INFO level, bare messages."""
    logging.basicConfig(
        format='%(message)s',   # show message only
        level=logging.INFO,     # INFO and above (warnings, errors)
        force=True,             # override any earlier logging configuration
    )
148
+
149
def debug_logging():
    """Configure root logging for debugging: DEBUG level, timestamped messages."""
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',  # time - level - message
        level=logging.DEBUG,    # show messages of all levels
        force=True,             # override any earlier logging configuration
    )
155
+
156
def setup_logging(debug: bool = False) -> None:
    """
    Configure root logging and third-party logger levels.

    With debug=True everything (including gensim and spacy) logs at
    DEBUG; otherwise INFO is used and gensim is capped at WARNING,
    because its INFO output is too verbose.
    """
    if debug:
        debug_logging()
        for noisy in ('gensim', 'spacy'):
            logging.getLogger(noisy).setLevel(logging.DEBUG)
    else:
        normal_logging()
        # gensim is too chatty at INFO level
        logging.getLogger('gensim').setLevel(logging.WARNING)

    name_logging_level = logging.getLevelName(logging.root.level)
    logging.info(f'Current logging level: {name_logging_level}')
    return
170
+
171
+
172
+
173
+ """ other functions """
174
+ T = TypeVar('T')
175
+ P = ParamSpec('P')
176
+
177
+ def handle_errors(
178
+ func: Callable[..., T],
179
+ *args: Any,
180
+ on_error: Optional[Callable[[Exception], Optional[T]]] = None,# only use as keyword argument!
181
+ **kwargs: Any,
182
+ )->Optional[T]:
183
+ """
184
+ Takes a function with arguments and executes it
185
+ while logging error messages. If an error occurs,
186
+ the provided `on_error` callback is executed (if given).
187
+ """
188
+
189
+ try:
190
+ return func(*args,**kwargs)
191
+ except Exception as e:
192
+ logging.error(f'Error in {func.__name__}: {type(e).__name__}: {e}')
193
+ if on_error:
194
+ on_error()
195
+ return
196
+
197
+
198
+ def intersect_lists(
199
+ one: list | None,
200
+ two: list | None,
201
+ )-> list | None:
202
+
203
+ if not one or not two:
204
+ return None
205
+ else:
206
+ return list(set(one) & set(two))
207
+
208
+ def sord_by_order(
209
+ to_be_ordered: list | set,
210
+ order: list | tuple,
211
+ )-> tuple:
212
+
213
+ assert set(to_be_ordered).issubset(set(order)), (
214
+ f'The requested ordering of {to_be_ordered}\n'
215
+ f'is not part of {order}'
216
+ )
217
+
218
+ return tuple(
219
+ item
220
+ for item in order
221
+ if item in to_be_ordered
222
+ )
223
+
224
+
225
+ def get_measure_names(
226
+ measures_available: list[str],
227
+ measures_input: list[str] | set[str] | None = None,
228
+ ) -> tuple[str, ...]:
229
+ """
230
+ check if input measures are correct
231
+ if no specific input measures are requested,
232
+ calculate all of them
233
+ """
234
+
235
+ if measures_input is None:
236
+ logging.debug(
237
+ f'All measures will be computed. That are:\n\n'
238
+ f'{measures_available}'
239
+ )
240
+ return tuple(measures_available)
241
+ else:
242
+ # bring input measures in order of measures_available
243
+ measures_in_sequence = sord_by_order(
244
+ to_be_ordered=measures_input,
245
+ order=measures_available,
246
+ )
247
+
248
+ return measures_in_sequence
249
+
250
+
251
def open_list(
    path:Path
) -> list[str]:
    """
    Read a text file and return its lines as a list of strings.

    Logs a warning (and re-raises) if the file does not exist, and
    warns about a trailing empty line.
    """

    try:
        # explicit utf-8 for consistency with the other readers in this module
        with open(path, 'r', encoding='utf-8') as f:
            terms = f.read().splitlines()

        # BUG FIX: guard against an empty file before peeking at terms[-1]
        if terms and terms[-1] == '':
            logging.warning(f'Empty string at the end of {path.name}')

        return terms

    except FileNotFoundError:
        logging.warning(
            f'{__name__}: The file {path.name} could not be found in {path}.\n'
        )
        raise
272
+
273
def load_csv(
    path: Path,
    has_header: bool = True,
) -> pd.DataFrame:
    """
    Load a CSV file using the project-wide CSV settings.

    pandas expects ``header`` to be an int giving the line number of the
    header row, or None if the file has no header.
    """
    return pd.read_csv(
        path,
        header=0 if has_header else None,
        **common_config.CSV_EXPORT_CONFIG
    )
289
+
290
+ # Outdated and not used so far
291
+ # def load_results(
292
+ # path: Path # path to a directory
293
+ # )-> dict[str,np.ndarray]:
294
+ # """
295
+ # Loads data from CSV files in the specified directory
296
+ # into a dictionary of NumPy arrays.
297
+ # """
298
+
299
+ # paths = create_pathlist(path,'csv')
300
+ # file_names = [path.name for path in paths]
301
+ # logging.debug(f'Found {file_names}')
302
+ # results = {}
303
+
304
+ # for path in paths:
305
+
306
+ # # exclude terms.csv from being read
307
+ # if path.stem=='terms':
308
+ # continue
309
+ # results[path.stem]=np.array(open_list(path))
310
+
311
+ # logging.debug(f'loaded {list(results.keys())}')
312
+
313
+ # # test the loaded arrays
314
+ # first_array = list(results.values())[0]
315
+ # length = len(first_array)
316
+
317
+ # # check if length is plausible
318
+ # assert len(first_array) > 30, f'Length of {list(results.keys())[0]} is below 30!'
319
+
320
+ # # check if all arrays are of the same length
321
+ # for key, array in results.items():
322
+ # assert (
323
+ # len(array) == length
324
+ # ), f"Array for key '{key}' has only {len(array)} elements (expected: {length})"
325
+
326
+ # return results
327
+
328
+
329
+ """ general helpers """
330
def sort_arrays_desc(
    array: np.ndarray,
) -> np.ndarray:
    """Return a copy of *array* sorted in descending order."""
    ascending = np.sort(array)
    return ascending[::-1]
335
+
336
def chunk_list(
    items: list[str],
    chunk_length: int
) -> list[list[str]]:
    """
    Split *items* into consecutive chunks of at most *chunk_length* elements.
    """
    chunks = []
    for start in range(0, len(items), chunk_length):
        chunks.append(items[start : start + chunk_length])
    return chunks
File without changes
@@ -0,0 +1,57 @@
1
+ # third-party imports
2
+ import numpy as np
3
+
4
+ # application-specific imports
5
+ from pydistintox.common.config import SparseMatrix
6
+
7
+ # intra-package imports
8
+ from .measures import (
9
+ calculate_Eta_deviation_of_proportions,
10
+ calculate_original_Zeta,
11
+ calculate_ratio_of_relative_frequencies,
12
+ calculate_Zeta_log2_transformed,
13
+ calculate_chi_squared_test,
14
+ calculate_log_likelihood_ratio_test,
15
+ calculate_welch_t_test,
16
+ calculate_wilkoxon_rank_sum_test,
17
+ )
18
+
19
+
20
+ """measures"""
21
+
22
# CAUTION: the order of this dict determines the order in which
# the measures are calculated (Python dicts preserve insertion order)
mapping_of_names_and_measure_functions = {
    'welch_t_value' : calculate_welch_t_test, # needs dense matrices
    'ranksumtest_value' : calculate_wilkoxon_rank_sum_test, # is faster with dense
    'chi_square_value' : calculate_chi_squared_test, # is faster with dense
    'eta_sg0' : calculate_Eta_deviation_of_proportions,
    'zeta_sd0' : calculate_original_Zeta,
    'rrf_dr0' : calculate_ratio_of_relative_frequencies,
    'zeta_sd2' : calculate_Zeta_log2_transformed,
    'LLR_value' : calculate_log_likelihood_ratio_test,
}

# canonical, ordered list of measure names derived from the mapping above
measures_available = list(mapping_of_names_and_measure_functions.keys())
36
+
37
+ # list of measures (not used)
38
+ # measures =[
39
+ # 'zeta_sd0',
40
+ # 'zeta_sd2',
41
+ # 'rrf_dr0',
42
+ # 'eta_sg0',
43
+ # 'welch_t_value',
44
+ # 'ranksumtest_value',
45
+ # 'chi_square_value',
46
+ # 'LLR_value'
47
+ # # 'tf_idf'
48
+ # ]
49
+
50
+ # names_of_td_matrices = [
51
+ # 'abs_tar',
52
+ # 'abs_ref',
53
+ # 'bin_tar',
54
+ # 'bin_ref',
55
+ # 'rel_tar',
56
+ # 'rel_ref',
57
+ # ]
@@ -0,0 +1,159 @@
1
+ # standard library imports
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ # third-party imports
6
+ import numpy as np
7
+ import pandas as pd
8
+ from scipy.stats import kendalltau
9
+
10
+ # application-specific imports
11
+ import pydistintox.common.config as common_config
12
+ from pydistintox.common.config import (
13
+ SparseMatrix,
14
+ Config
15
+ )
16
+ import pydistintox.common.utils as common_utils
17
+ from pydistintox.common.utils import (
18
+ handle_errors,
19
+ )
20
+
21
+ # intra-package imports
22
+ from .config import (
23
+ measures_available,
24
+ mapping_of_names_and_measure_functions,
25
+ )
26
+ from .measures import (
27
+ get_indicators,
28
+ scale_results,
29
+ )
30
+ from .utils import (
31
+ log_shapes,
32
+ )
33
+
34
+
35
def calculate_scores(
    matrices: common_config.Matrices, # containing also docprops and relative
    config: Config,
)-> dict[str, np.ndarray]:
    """
    Main calculation entry point: computes the requested distinctiveness
    measures and returns a dictionary with (a subset of) these keys:
    ['zeta_sd0', 'zeta_sd2', 'rrf_dr0',
    'eta_sg0', 'welch_t_value', 'ranksumtest_value',
    'chi_square_value', 'LLR_value', 'tf_idf']
    """

    logging.info('Starting to calculate scores...')
    logging.debug(f'Measures requested: {config.measures_to_calculate}')

    """ Preprocessing """

    # restrict the request to the measures this package can compute
    non_tf_idf_measures_to_calculate = common_utils.intersect_lists(
        config.measures_to_calculate,
        measures_available
    )
    # bring them into the canonical calculation order
    measures_in_sequence = common_utils.get_measure_names(
        measures_input=non_tf_idf_measures_to_calculate,
        measures_available=measures_available,
    )
    logging.debug(
        f'measures in sequence: \n'
        f'{measures_in_sequence}'
    )

    log_shapes(matrices)

    """ Gather data for analysis"""
    # data is bundled in one object which is passed to every measure function

    # calculate indicators (docprops / relative frequencies)
    data = common_config.CalculationData(
        matrices,
        get_indicators(matrices)
    )

    # save parameters
    data.set_params(
        segmentlength=config.segmentlength,
        logaddition=config.logaddition,
        divaddition=config.divaddition,
    )

    """ Start calculation"""
    result = {}

    for measure_name in measures_in_sequence:
        # load function for measure
        fct = mapping_of_names_and_measure_functions[measure_name]
        # execute function; handle_errors logs failures and returns None
        result_measure = handle_errors(
            fct,
            data,
        )

        # keep only successfully computed measures
        if result_measure is not None:
            result[measure_name] = result_measure
        else:
            # BUG FIX: this message was previously logged in the
            # `is not None` (success) branch, i.e. exactly when the
            # measure was NOT None
            logging.debug(f'{measure_name} is None')

    logging.debug(f'Scores calculated: \n {result.keys()}')
    # For the time being, dense matrices are necessary for welch's t-Test only.
    # However, if they are generated, Chi-squared test and Wilcoxon rank-sum test
    # use these dense matrices, since they make the computation faster.

    dense_absolute_matrices_loaded = (
        data.matrices.abs_dense_tar is not None and data.matrices.abs_dense_ref is not None
    )
    logging.debug(
        f'dense_absolute_matrices_loaded = {dense_absolute_matrices_loaded}'
    )

    # results of tf-idf measures are provided by the other package td_matrices
    not_computed = set(measures_in_sequence).difference(set(result.keys()))
    logging.debug(f"Results calculated for: {', '.join(result.keys())}")
    if not_computed:
        logging.info(f"Could not compute {', '.join(not_computed)}")

    # scaling
    if config.scaling:
        scale_results(
            result,
        )

    logging.debug(
        f'statistical measures calculated \n'
        f' {calculate_scores.__name__} finished')

    return result
131
+
132
+
133
def calculate_rank_correlation(
    results: dict[str,np.ndarray],
)-> tuple[np.ndarray, list[str]]:
    """
    Compute the pairwise Kendall rank correlation between all measures.

    Returns a symmetric (n x n) matrix of tau values plus the list of
    measure names that defines the row/column order. As the input
    arrays are uniformly scaled, only rank correlation is meaningful.
    """

    index = list(results.keys())
    size = len(index)
    matrix = np.empty((size, size))

    # fill the matrix pair by pair
    for row, key_row in enumerate(index):
        for col, key_col in enumerate(index):
            tau = kendalltau(results[key_row], results[key_col])[0]  # p-value not used
            matrix[row, col] = tau
    return matrix, index
@@ -0,0 +1,50 @@
1
+ # standard library imports
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ # third-party imports
6
+ import scipy.sparse as sp
7
+
8
+ # application-specific imports
9
+ from pydistintox.common.config import SparseMatrix
10
+
11
+
12
+ """Load functions"""
13
+
14
+ def load_sparse(
15
+ file_names:list[str],
16
+ input_dir: Path
17
+ )-> dict[str,sp.csc_array|sp.csr_array]:
18
+ """
19
+ takes a list of file names such as ['absfreq_csr.pkl',...],
20
+ and returns a dictionary with the file names as keys.
21
+ """
22
+
23
+ matrices = {}
24
+ logging.debug(
25
+ 'The following matrices will be loaded:',
26
+ file_names
27
+ )
28
+
29
+ for name in file_names:
30
+ file_path = input_dir / name
31
+ if file_path.exists():
32
+ matrix = sp.load_npz(file_path)
33
+ matrices[name.removesuffix('.npz')] = matrix
34
+ else:
35
+ logging.error(f'Warning: File {name} not found in {input_dir}')
36
+
37
+ logging.debug(
38
+ 'saved matrices in dictionary:',
39
+ list(matrices.keys())
40
+ )
41
+
42
+ return matrices
43
+
44
+
45
def load_sparse_from_path(
    path:Path
)-> SparseMatrix:
    """Load a single sparse matrix from the given .npz file path."""
    # typo fix in the log message: 'matrice' -> 'matrix'
    logging.debug(f'The following matrix is loaded: {path.name}')
    return sp.load_npz(path)