csv-detective 0.8.1.dev1416__py3-none-any.whl → 0.8.1.dev1440__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (32)
  1. {csv_detective-0.8.1.dev1416.data → csv_detective-0.8.1.dev1440.data}/data/share/csv_detective/CHANGELOG.md +2 -1
  2. csv_detective-0.8.1.dev1440.data/data/share/csv_detective/LICENSE +21 -0
  3. {csv_detective-0.8.1.dev1416.data → csv_detective-0.8.1.dev1440.data}/data/share/csv_detective/README.md +6 -39
  4. csv_detective-0.8.1.dev1440.dist-info/METADATA +267 -0
  5. {csv_detective-0.8.1.dev1416.dist-info → csv_detective-0.8.1.dev1440.dist-info}/RECORD +9 -29
  6. csv_detective-0.8.1.dev1440.dist-info/licenses/LICENSE +21 -0
  7. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  8. csv_detective/detect_fields/FR/other/csp_insee/csp_insee.txt +0 -571
  9. csv_detective/detect_fields/FR/other/insee_ape700/insee_ape700.txt +0 -733
  10. csv_detective/detect_fields/geo/iso_country_code_alpha2/iso_country_code_alpha2.txt +0 -495
  11. csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt +0 -251
  12. csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt +0 -251
  13. csv_detective/detection/columns.py +0 -89
  14. csv_detective/detection/encoding.py +0 -27
  15. csv_detective/detection/engine.py +0 -46
  16. csv_detective/detection/formats.py +0 -145
  17. csv_detective/detection/headers.py +0 -32
  18. csv_detective/detection/rows.py +0 -18
  19. csv_detective/detection/separator.py +0 -44
  20. csv_detective/detection/variables.py +0 -98
  21. csv_detective/parsing/columns.py +0 -139
  22. csv_detective/parsing/compression.py +0 -11
  23. csv_detective/parsing/csv.py +0 -55
  24. csv_detective/parsing/excel.py +0 -169
  25. csv_detective/parsing/load.py +0 -97
  26. csv_detective/parsing/text.py +0 -61
  27. csv_detective-0.8.1.dev1416.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  28. csv_detective-0.8.1.dev1416.dist-info/METADATA +0 -42
  29. csv_detective-0.8.1.dev1416.dist-info/licenses/LICENSE.AGPL.txt +0 -661
  30. {csv_detective-0.8.1.dev1416.dist-info → csv_detective-0.8.1.dev1440.dist-info}/WHEEL +0 -0
  31. {csv_detective-0.8.1.dev1416.dist-info → csv_detective-0.8.1.dev1440.dist-info}/entry_points.txt +0 -0
  32. {csv_detective-0.8.1.dev1416.dist-info → csv_detective-0.8.1.dev1440.dist-info}/top_level.txt +0 -0
csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt
@@ -1,251 +0,0 @@
- 004
- 710
- 248
- 008
- 012
- 276
- 020
- 024
- 660
- 010
- 028
- 682
- 032
- 051
- 533
- 036
- 040
- 031
- 044
- 048
- 050
- 052
- 112
- 056
- 084
- 204
- 060
- 064
- 068
- 535
- 070
- 072
- 074
- 076
- 096
- 100
- 854
- 108
- 136
- 116
- 120
- 124
- 132
- 140
- 152
- 156
- 162
- 196
- 166
- 170
- 174
- 178
- 180
- 184
- 410
- 408
- 188
- 384
- 191
- 192
- 531
- 208
- 262
- 214
- 212
- 818
- Gum
- (Ré
- 222
- 784
- 218
- 232
- 724
- 233
- 840
- 231
- 238
- 234
- 242
- 246
- 250
- 266
- 270
- 268
- 239
- 288
- 292
- 300
- 308
- 304
- 312
- 316
- 320
- 831
- 324
- 624
- 226
- 328
- 254
- 332
- 334
- 340
- 344
- 348
- 833
- 581
- 092
- 850
- 356
- 360
- 364
- 368
- 372
- 352
- 376
- 380
- 388
- 392
- 832
- 400
- 398
- 404
- 417
- 296
- 414
- 418
- 426
- 428
- 422
- 430
- 434
- 438
- 440
- 442
- 446
- 807
- 450
- 458
- 454
- 462
- 466
- 470
- 580
- 504
- 584
- 474
- 480
- 478
- 175
- 484
- 583
- 498
- 492
- 496
- 499
- 500
- 508
- 104
- 516
- 520
- 524
- 558
- 562
- 566
- 570
- 574
- 578
- 540
- 554
- 086
- 512
- 800
- 860
- 586
- 585
- 275
- 591
- 598
- 600
- 528
- 604
- 608
- 612
- 616
- 258
- 630
- 620
- 634
- 638
- 642
- 826
- 643
- 646
- 732
- 652
- 659
- 674
- 663
- 534
- 666
- 336
- 670
- 654
- 662
- 090
- 882
- 016
- 678
- 686
- 688
- 690
- 694
- 702
- 703
- 705
- 706
- 729
- 728
- 144
- 752
- 756
- 740
- 744
- 748
- 760
- 762
- 158
- 834
- 148
- 203
- 260
- 764
- 626
- 768
- 772
- 776
- 780
- 788
- 795
- 796
- 792
- 798
- 804
- 858
- 548
- 862
- 704
- 876
- 887
- 894
- 716
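The removed .txt files are the reference lists that back the corresponding detect_fields tests. A minimal sketch of how such a list becomes a membership test (hypothetical helper names, not the package's actual loader):

    # Hypothetical illustration: build a field test from a reference list of codes.
    def build_code_test(codes: set[str]):
        def is_known_code(value: str) -> bool:
            # a cell matches if its stripped value is one of the reference codes
            return value.strip() in codes
        return is_known_code

    is_iso_numeric = build_code_test({"004", "008", "710"})  # tiny subset for illustration
    print(is_iso_numeric("710"))  # True
    print(is_iso_numeric("999"))  # False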
csv_detective/detection/columns.py
@@ -1,89 +0,0 @@
- import logging
- from typing import TextIO
- from time import time
-
- from csv_detective.utils import display_logs_depending_process_time
-
-
- def detect_extra_columns(file: TextIO, sep: str):
-     """regarde s'il y a des colonnes en trop
-     Attention, file ne doit pas avoir de ligne vide"""
-     file.seek(0)
-     retour = False
-     nb_useless_col = 99999
-
-     for i in range(10):
-         line = file.readline()
-         # regarde si on a un retour
-         if retour:
-             assert line[-1] == "\n"
-         if line[-1] == "\n":
-             retour = True
-
-         # regarde le nombre de derniere colonne inutile
-         deb = 0 + retour
-         line = line[::-1][deb:]
-         k = 0
-         for sign in line:
-             if sign != sep:
-                 break
-             k += 1
-         if k == 0:
-             return 0, retour
-         nb_useless_col = min(k, nb_useless_col)
-     return nb_useless_col, retour
-
-
- def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
-     """Tests first 10 lines to see if there are empty heading columns"""
-     if verbose:
-         start = time()
-         logging.info("Detecting heading columns")
-     file.seek(0)
-     return_int = float("Inf")
-     for i in range(10):
-         line = file.readline()
-         return_int = min(return_int, len(line) - len(line.strip(sep)))
-         if return_int == 0:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'No heading column detected in {round(time() - start, 3)}s',
-                     time() - start,
-                 )
-             return 0
-     if verbose:
-         display_logs_depending_process_time(
-             f'{return_int} heading columns detected in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return return_int
-
-
- def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
-     """Tests first 10 lines to see if there are empty trailing columns"""
-     if verbose:
-         start = time()
-         logging.info("Detecting trailing columns")
-     file.seek(0)
-     return_int = float("Inf")
-     for i in range(10):
-         line = file.readline()
-         return_int = min(
-             return_int,
-             len(line.replace("\n", ""))
-             - len(line.replace("\n", "").strip(sep))
-             - heading_columns,
-         )
-         if return_int == 0:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'No trailing column detected in {round(time() - start, 3)}s',
-                     time() - start,
-                 )
-             return 0
-     if verbose:
-         display_logs_depending_process_time(
-             f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return return_int
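These helpers scan the first ten raw lines of a file to count empty leading and trailing columns. A quick usage sketch against the pre-removal module path (csv_detective.detection.columns, which no longer exists in dev1440):

    from io import StringIO
    from csv_detective.detection.columns import detect_heading_columns, detect_trailing_columns

    # one empty leading and two empty trailing columns, repeated so 10 lines can be read
    buf = StringIO(";a;b;;\n" * 12)
    heading = detect_heading_columns(buf, sep=";")                              # -> 1
    trailing = detect_trailing_columns(buf, sep=";", heading_columns=heading)   # -> 2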
csv_detective/detection/encoding.py
@@ -1,27 +0,0 @@
- import logging
- from time import time
- from io import BytesIO
-
- from cchardet import detect
-
- from csv_detective.utils import display_logs_depending_process_time
-
-
- def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
-     """
-     Detects file encoding using faust-cchardet (forked from the original cchardet)
-     """
-     if verbose:
-         start = time()
-         logging.info("Detecting encoding")
-     encoding_dict = detect(binary_file.read())
-     if not encoding_dict["encoding"]:
-         raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
-     if verbose:
-         message = f'Detected encoding: "{encoding_dict["encoding"]}"'
-         message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
-         display_logs_depending_process_time(
-             message,
-             time() - start,
-         )
-     return encoding_dict['encoding']
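detect_encoding reads the whole binary buffer once and asks cchardet for its best guess, raising when nothing is detected. A hedged sketch of the call pattern under the old layout (requires faust-cchardet):

    from io import BytesIO
    from csv_detective.detection.encoding import detect_encoding

    raw = "code;libellé\n01;Ain\n".encode("utf-8")
    encoding = detect_encoding(BytesIO(raw))
    # typically a UTF-8 family label here; the exact value depends on the detector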
csv_detective/detection/engine.py
@@ -1,46 +0,0 @@
- from time import time
- from typing import Optional
-
- import magic
- import requests
-
- from csv_detective.utils import display_logs_depending_process_time, is_url
-
- COMPRESSION_ENGINES = ["gzip"]
- EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
- engine_to_file = {
-     "openpyxl": "Excel",
-     "xlrd": "old Excel",
-     "odf": "OpenOffice",
-     "gzip": "csv.gz",
- }
-
-
- def detect_engine(file_path: str, verbose=False) -> Optional[str]:
-     if verbose:
-         start = time()
-     mapping = {
-         "application/gzip": "gzip",
-         "application/x-gzip": "gzip",
-         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
-         'application/vnd.ms-excel': 'xlrd',
-         'application/vnd.oasis.opendocument.spreadsheet': 'odf',
-         # all these files could be recognized as zip, may need to check all cases then
-         'application/zip': 'openpyxl',
-     }
-     # if none of the above, we move forwards with the csv process
-     if is_url(file_path):
-         remote_content = requests.get(file_path).content
-         engine = mapping.get(magic.from_buffer(remote_content, mime=True))
-     else:
-         engine = mapping.get(magic.from_file(file_path, mime=True))
-     if verbose:
-         message = (
-             f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
-             if engine else "Processing the file as a csv"
-         )
-         display_logs_depending_process_time(
-             message,
-             time() - start,
-         )
-     return engine
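The engine detection only decides which reader the file is handed to: a MIME sniff via python-magic, mapped to a pandas engine, with None meaning "treat it as csv". A sketch under the old layout (the .xlsx path is hypothetical):

    from csv_detective.detection.engine import detect_engine

    engine = detect_engine("data/annuaire.xlsx")  # hypothetical local file
    # -> "openpyxl" for .xlsx content, "xlrd" for legacy .xls, "odf" for .ods,
    #    "gzip" for csv.gz, or None to fall back to the plain csv pipeline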
csv_detective/detection/formats.py
@@ -1,145 +0,0 @@
- from collections import defaultdict
- from typing import Union
-
- import numpy as np
- import pandas as pd
- from csv_detective.detection.variables import (
-     detect_categorical_variable,
-     # detect_continuous_variable,
- )
- from csv_detective.load_tests import return_all_tests
- from csv_detective.output.utils import prepare_output_dict
- from csv_detective.parsing.columns import test_col, test_label
-
-
- def detect_formats(
-     table: pd.DataFrame,
-     analysis: dict,
-     user_input_tests: Union[str, list[str]] = "ALL",
-     limited_output: bool = True,
-     skipna: bool = True,
-     verbose: bool = False,
- ):
-
-     if table.empty:
-         res_categorical = []
-         # res_continuous = []
-     else:
-         # Detects columns that are categorical
-         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
-         res_categorical = list(res_categorical)
-         # Detect columns that are continuous (we already know the categorical) :
-         # we don't need this for now, cuts processing time
-         # res_continuous = list(
-         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-         # )
-
-     analysis.update({
-         "categorical": res_categorical,
-         # "continuous": res_continuous,
-     })
-
-     # list testing to be performed
-     all_tests_fields = return_all_tests(
-         user_input_tests, detect_type="detect_fields"
-     )  # list all tests for the fields
-     all_tests_labels = return_all_tests(
-         user_input_tests, detect_type="detect_labels"
-     )  # list all tests for the labels
-
-     # if no testing then return
-     if not all_tests_fields and not all_tests_labels:
-         return analysis
-
-     # Perform testing on fields
-     scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-     # Perform testing on labels
-     scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-     # This is because the fields are more important than the labels and yields a max
-     # of 1.5 for the final score.
-     scores_table = scores_table_fields * (
-         1
-         + scores_table_labels.reindex(
-             index=scores_table_fields.index, fill_value=0
-         ).values / 2
-     )
-
-     # To reduce false positives: ensure these formats are detected only if the label yields
-     # a detection (skipping the ones that have been excluded by the users).
-     formats_with_mandatory_label = [
-         f for f in [
-             "code_departement",
-             "code_commune_insee",
-             "code_postal",
-             "latitude_wgs",
-             "longitude_wgs",
-             "latitude_wgs_fr_metropole",
-             "longitude_wgs_fr_metropole",
-             "latitude_l93",
-             "longitude_l93",
-         ] if f in scores_table.index
-     ]
-     scores_table.loc[formats_with_mandatory_label, :] = np.where(
-         scores_table_labels.loc[formats_with_mandatory_label, :],
-         scores_table.loc[formats_with_mandatory_label, :],
-         0,
-     )
-     analysis["columns"] = prepare_output_dict(scores_table, limited_output)
-
-     metier_to_python_type = {
-         "booleen": "bool",
-         "int": "int",
-         "float": "float",
-         "string": "string",
-         "json": "json",
-         "json_geojson": "json",
-         "datetime": "datetime",
-         "datetime_iso": "datetime",
-         "datetime_rfc822": "datetime",
-         "date": "date",
-         "latitude": "float",
-         "latitude_l93": "float",
-         "latitude_wgs": "float",
-         "latitude_wgs_fr_metropole": "float",
-         "longitude": "float",
-         "longitude_l93": "float",
-         "longitude_wgs": "float",
-         "longitude_wgs_fr_metropole": "float",
-     }
-
-     if not limited_output:
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             analysis[detection_method] = {
-                 col_name: [
-                     {
-                         "python_type": metier_to_python_type.get(
-                             detection["format"], "string"
-                         ),
-                         **detection,
-                     }
-                     for detection in detections
-                 ]
-                 for col_name, detections in analysis[detection_method].items()
-             }
-     else:
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             analysis[detection_method] = {
-                 col_name: {
-                     "python_type": metier_to_python_type.get(
-                         detection["format"], "string"
-                     ),
-                     **detection,
-                 }
-                 for col_name, detection in analysis[detection_method].items()
-             }
-
-     # Add detection with formats as keys
-     analysis["formats"] = defaultdict(list)
-     for header, col_metadata in analysis["columns"].items():
-         analysis["formats"][col_metadata["format"]].append(header)
-     return analysis
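The scoring step multiplies field scores by 1 + label_score / 2, so a column whose values and label both match a format tops out at 1.5, while a matching label alone scores nothing. A self-contained numeric illustration with toy values:

    import pandas as pd

    fields = pd.DataFrame({"col": [1.0, 0.0]}, index=["code_postal", "siren"])
    labels = pd.DataFrame({"col": [1.0, 1.0]}, index=["code_postal", "siren"])
    blended = fields * (1 + labels.reindex(index=fields.index, fill_value=0).values / 2)
    # code_postal -> 1.5 (values and label agree), siren -> 0.0 (label alone is not enough)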
csv_detective/detection/headers.py
@@ -1,32 +0,0 @@
- import logging
- from time import time
- from typing import Optional, TextIO
-
- from csv_detective.utils import display_logs_depending_process_time
-
-
- def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
-     """Tests 10 first rows for possible header (in case header is not 1st row)"""
-     if verbose:
-         start = time()
-         logging.info("Detecting headers")
-     file.seek(0)
-     for i in range(10):
-         header = file.readline()
-         position = file.tell()
-         chaine = [c for c in header.replace("\n", "").split(sep) if c]
-         if chaine[-1] not in ["", "\n"] and all(
-             [mot not in ["", "\n"] for mot in chaine[1:-1]]
-         ):
-             next_row = file.readline()
-             file.seek(position)
-             if header != next_row:
-                 if verbose:
-                     display_logs_depending_process_time(
-                         f'Detected headers in {round(time() - start, 3)}s',
-                         time() - start,
-                     )
-                 return i, chaine
-     if verbose:
-         logging.info('No header detected')
-     return 0, None
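detect_headers returns how many rows to skip plus the header cells it settled on. A sketch with a well-formed file, using the old module path:

    from io import StringIO
    from csv_detective.detection.headers import detect_headers

    buf = StringIO("id;name\n1;Alice\n2;Bob\n")
    skip_rows, header = detect_headers(buf, sep=";")
    # -> (0, ["id", "name"]): the first row already is the header, nothing to skip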
csv_detective/detection/rows.py
@@ -1,18 +0,0 @@
- import pandas as pd
-
-
- def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-     """Analog process to detect_headers for csv files, determines how many rows to skip
-     to end up with the header at the right place"""
-     idx = 0
-     if all([str(c).startswith('Unnamed:') for c in table.columns]):
-         # there is on offset between the index in the file (idx here)
-         # and the index in the dataframe, because of the header
-         idx = 1
-         while table.iloc[idx - 1].isna().all():
-             idx += 1
-         cols = table.iloc[idx - 1]
-         table = table.iloc[idx:]
-         table.columns = cols.to_list()
-         # +1 here because the headers should count as a row
-     return table, idx
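remove_empty_first_rows does the same job for tables pandas has already loaded with placeholder "Unnamed:" columns: it walks past leading all-NaN rows, promotes the first real row to header, and reports how many rows were consumed. A small sketch under the old layout:

    import numpy as np
    import pandas as pd
    from csv_detective.detection.rows import remove_empty_first_rows

    raw = pd.DataFrame(
        [[np.nan, np.nan], ["id", "name"], [1, "Alice"]],
        columns=["Unnamed: 0", "Unnamed: 1"],
    )
    cleaned, skipped = remove_empty_first_rows(raw)
    # -> skipped == 2, cleaned has columns ["id", "name"] and a single data row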
csv_detective/detection/separator.py
@@ -1,44 +0,0 @@
- import csv
- import logging
- from time import time
- from typing import TextIO
-
- from csv_detective.utils import display_logs_depending_process_time
-
-
- def detect_separator(file: TextIO, verbose: bool = False) -> str:
-     """Detects csv separator"""
-     # TODO: add a robust detection:
-     # si on a un point virgule comme texte et \t comme séparateur, on renvoie
-     # pour l'instant un point virgule
-     if verbose:
-         start = time()
-         logging.info("Detecting separator")
-     file.seek(0)
-     header = file.readline()
-     possible_separators = [";", ",", "|", "\t"]
-     sep_count = dict()
-     for sep in possible_separators:
-         sep_count[sep] = header.count(sep)
-     sep = max(sep_count, key=sep_count.get)
-     # testing that the first 10 (arbitrary) rows all have the same number of fields
-     # as the header. Prevents downstream unwanted behaviour where pandas can load
-     # the file (in a weird way) but the process is irrelevant.
-     file.seek(0)
-     reader = csv.reader(file, delimiter=sep)
-     rows_lengths = set()
-     for idx, row in enumerate(reader):
-         if idx > 10:
-             break
-         rows_lengths.add(len(row))
-     if len(rows_lengths) > 1:
-         raise ValueError(
-             f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
-         )
-
-     if verbose:
-         display_logs_depending_process_time(
-             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return sep
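detect_separator counts candidate delimiters on the header line, keeps the most frequent one, and then sanity-checks that the first rows parse to a consistent width. A sketch under the old layout:

    from io import StringIO
    from csv_detective.detection.separator import detect_separator

    buf = StringIO("id;name\n1;Alice\n2;Bob\n")
    print(detect_separator(buf))  # ";" wins the count on the header row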