csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1228__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_fields/other/float/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
- csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detect_labels/other/email/__init__.py +1 -1
- csv_detective/detect_labels/other/float/__init__.py +1 -1
- csv_detective/detect_labels/other/int/__init__.py +1 -1
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detect_labels/other/twitter/__init__.py +1 -1
- csv_detective/detect_labels/other/url/__init__.py +1 -1
- csv_detective/detect_labels/other/uuid/__init__.py +1 -1
- csv_detective/detect_labels/temp/date/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
- csv_detective/detect_labels/temp/year/__init__.py +1 -1
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/explore_csv.py +40 -124
- csv_detective/output/dataframe.py +55 -0
- csv_detective/{create_example.py → output/example.py} +10 -9
- csv_detective/output/profile.py +87 -0
- csv_detective/{schema_generation.py → output/schema.py} +344 -343
- csv_detective/output/utils.py +51 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/utils.py +10 -236
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1228.data}/data/share/csv_detective/CHANGELOG.md +3 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/RECORD +85 -71
- tests/test_fields.py +8 -7
- tests/test_file.py +15 -14
- csv_detective/detection.py +0 -633
- /csv_detective/{process_text.py → parsing/text.py} +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1228.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1228.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
|
@@ -12,7 +12,7 @@ def _is(val):
|
|
|
12
12
|
if (
|
|
13
13
|
not isinstance(val, str)
|
|
14
14
|
or any([k in val for k in ['_', '+', 'e', 'E']])
|
|
15
|
-
or (val.startswith(
|
|
15
|
+
or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
|
|
16
16
|
):
|
|
17
17
|
return False
|
|
18
18
|
float_casting(val)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import TextIO
|
|
3
|
+
from time import time
|
|
4
|
+
|
|
5
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def detect_extra_columns(file: TextIO, sep: str):
    """Count the empty trailing columns shared by the first 10 lines of a file.

    The stream is rewound, then at most 10 lines are read. For every line the
    number of consecutive `sep` characters at its end (line break excluded)
    is counted, and the smallest count is kept.

    Warning: the file should not contain empty lines (an empty line has no
    trailing separator, so the result is immediately 0).

    Args:
        file: text stream to inspect; it is rewound with `seek(0)`.
        sep: column separator. Characters are compared one by one, so a
            multi-character separator never matches and yields 0.

    Returns:
        A `(nb_useless_col, retour)` tuple: the minimum number of trailing
        separators over the inspected lines (0 for an empty file), and
        whether at least one inspected line ended with a line break.
    """
    file.seek(0)
    retour = False
    nb_useless_col = None

    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: having fewer than 10 lines is not an error
            break
        # Remember whether a line break was seen, and ignore it when counting
        if line.endswith("\n"):
            retour = True
            line = line[:-1]

        # Count the separators at the very end of the line
        k = 0
        for sign in reversed(line):
            if sign != sep:
                break
            k += 1
        if k == 0:
            # One line without trailing separator is enough to conclude
            return 0, retour
        nb_useless_col = k if nb_useless_col is None else min(k, nb_useless_col)
    return (nb_useless_col or 0), retour
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
    """Test the first 10 lines to see if there are empty heading columns.

    The stream is rewound, then at most 10 lines are read. For every line the
    number of leading `sep` characters is counted, and the smallest count is
    returned.

    Args:
        file: text stream to inspect; it is rewound with `seek(0)`.
        sep: column separator.
        verbose: when True, log the result and the elapsed time.

    Returns:
        The minimum number of leading separators over the inspected lines
        (0 for an empty file).
    """
    if verbose:
        start = time()
        logging.info("Detecting heading columns")
    file.seek(0)
    heading = None
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: a short file must not be read as "no heading column"
            break
        # Only leading separators matter here, hence lstrip rather than strip
        nb_leading = len(line) - len(line.lstrip(sep))
        heading = nb_leading if heading is None else min(heading, nb_leading)
        if heading == 0:
            # Cannot go lower, no need to read further
            break
    heading = heading or 0
    if verbose:
        if heading == 0:
            message = f'No heading column detected in {round(time() - start, 3)}s'
        else:
            message = f'{heading} heading columns detected in {round(time() - start, 3)}s'
        display_logs_depending_process_time(
            message,
            time() - start,
        )
    return heading
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
    """Test the first 10 lines to see if there are empty trailing columns.

    The stream is rewound, then at most 10 lines are read. For every line the
    number of `sep` characters at its end (line break excluded) is counted,
    and the smallest count is returned.

    Args:
        file: text stream to inspect; it is rewound with `seek(0)`.
        sep: column separator.
        heading_columns: number of empty heading columns already detected;
            it is subtracted on lines made only of separators, where leading
            and trailing separators cannot be told apart.
        verbose: when True, log the result and the elapsed time.

    Returns:
        The minimum number of trailing separators over the inspected lines
        (0 for an empty file, never negative).
    """
    if verbose:
        start = time()
        logging.info("Detecting trailing columns")
    file.seek(0)
    trailing = None
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: a short file must not be read as "no trailing column"
            break
        content = line.replace("\n", "")
        stripped = content.rstrip(sep)
        nb_trailing = len(content) - len(stripped)
        if not stripped:
            # Separator-only line: discount the heading columns, never below 0
            nb_trailing = max(nb_trailing - heading_columns, 0)
        trailing = nb_trailing if trailing is None else min(trailing, nb_trailing)
        if trailing == 0:
            # Cannot go lower, no need to read further
            break
    trailing = trailing or 0
    if verbose:
        if trailing == 0:
            message = f'No trailing column detected in {round(time() - start, 3)}s'
        else:
            message = f'{trailing} trailing columns detected in {round(time() - start, 3)}s'
        display_logs_depending_process_time(
            message,
            time() - start,
        )
    return trailing
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from time import time
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
|
|
5
|
+
from cchardet import detect
|
|
6
|
+
|
|
7
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
    """
    Detect a file's encoding using faust-cchardet (forked from the original cchardet).

    The whole content of `binary_file` is read and handed to `detect`, whose
    result is used as a mapping with "encoding" and "confidence" keys.

    Args:
        binary_file: binary stream, read from its current position.
        verbose: when True, log the detected encoding, the confidence and
            the elapsed time.

    Returns:
        The detected encoding name.

    Raises:
        ValueError: if no encoding could be detected.
    """
    if verbose:
        start = time()
        logging.info("Detecting encoding")
    detection = detect(binary_file.read())
    encoding = detection["encoding"]
    if not encoding:
        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
    if verbose:
        confidence = round(detection["confidence"]*100)
        message = (
            f'Detected encoding: "{encoding}"'
            f' in {round(time() - start, 3)}s (confidence: {confidence}%)'
        )
        display_logs_depending_process_time(message, time() - start)
    return encoding
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from time import time
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import magic
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from csv_detective.utils import display_logs_depending_process_time, is_url
|
|
8
|
+
|
|
9
|
+
# Engines handling compressed csv files
COMPRESSION_ENGINES = ["gzip"]
# Engines handling spreadsheet files
EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
# Human-readable file kind for each engine, used in log messages
engine_to_file = {
    "openpyxl": "Excel",
    "xlrd": "old Excel",
    "odf": "OpenOffice",
    "gzip": "csv.gz",
}


def detect_engine(file_path: str, verbose=False) -> Optional[str]:
    """Pick the engine needed to read a file, based on its MIME type.

    The MIME type is obtained with `magic`: from the downloaded content when
    `file_path` is a URL (according to `is_url`), from the file itself
    otherwise.

    Args:
        file_path: local path or URL of the file.
        verbose: when True, log the outcome and the elapsed time.

    Returns:
        The engine name, or None when the MIME type is not in the mapping,
        in which case the file is processed as a csv.
    """
    if verbose:
        start = time()
    mime_to_engine = {
        "application/gzip": "gzip",
        "application/x-gzip": "gzip",
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
        'application/vnd.ms-excel': 'xlrd',
        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
        # all these files could be recognized as zip, may need to check all cases then
        'application/zip': 'openpyxl',
    }
    if is_url(file_path):
        mime_type = magic.from_buffer(requests.get(file_path).content, mime=True)
    else:
        mime_type = magic.from_file(file_path, mime=True)
    # if none of the above, we move forwards with the csv process
    engine = mime_to_engine.get(mime_type)
    if verbose:
        if engine:
            message = f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
        else:
            message = "Processing the file as a csv"
        display_logs_depending_process_time(message, time() - start)
    return engine
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from time import time
|
|
3
|
+
from typing import Optional, TextIO
|
|
4
|
+
|
|
5
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
    """Test the first 10 rows for a possible header (in case header is not 1st row).

    A row is a header candidate when it contains at least one non-empty
    cell; it is accepted when it differs from the row that follows it.

    Args:
        file: text stream to inspect; it is rewound with `seek(0)`. When a
            header is found, the stream is left right after the header row.
        sep: column separator.
        verbose: when True, log the outcome and the elapsed time.

    Returns:
        A `(row_index, header_cells)` tuple, where `header_cells` holds the
        non-empty cells of the header row, or `(0, None)` when no header was
        found.
    """
    if verbose:
        start = time()
        logging.info("Detecting headers")
    file.seek(0)
    for i in range(10):
        header = file.readline()
        if not header:
            # End of file reached before any header was found
            break
        position = file.tell()
        chaine = [c for c in header.replace("\n", "").split(sep) if c]
        if not chaine:
            # Blank or separator-only row: cannot be a header, try the next one
            continue
        next_row = file.readline()
        # Go back right after the candidate row so the next row is not consumed
        file.seek(position)
        if header != next_row:
            if verbose:
                display_logs_depending_process_time(
                    f'Detected headers in {round(time() - start, 3)}s',
                    time() - start,
                )
            return i, chaine
    if verbose:
        logging.info('No header detected')
    return 0, None
|