csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff compares publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in those registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
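The restructure visible above collapses the old nested `detect_fields/` and `detect_labels/` packages into flat modules under `csv_detective/formats/` (with reference data gathered in `formats/data/`), and splits the pipeline into `detection/`, `parsing/` and `output/` packages. The imports in the new `output/` modules below show that each format module exposes a plain casting helper (`bool_casting`, `float_casting`, `date_casting`, `binary_casting`). As a rough, hypothetical sketch of that shape — the value mapping here is illustrative, not the package's actual implementation:

```python
# Hypothetical sketch of a csv_detective/formats/*.py casting helper, inferred
# from imports such as `from csv_detective.formats.booleen import bool_casting`
# in the hunks below; the real module's accepted values may differ.
def bool_casting(value: str) -> bool | None:
    mapping = {
        "true": True, "vrai": True, "oui": True,
        "false": False, "faux": False, "non": False,
    }
    # None signals "not a recognisable boolean" to the caller
    return mapping.get(value.strip().lower())
```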
csv_detective/output/dataframe.py (new file):

```diff
@@ -0,0 +1,96 @@
+import json
+from datetime import date, datetime
+from time import time
+from typing import Iterator
+
+import pandas as pd
+
+from csv_detective.formats.binary import binary_casting
+from csv_detective.formats.booleen import bool_casting
+from csv_detective.formats.date import date_casting
+from csv_detective.formats.float import float_casting
+from csv_detective.parsing.csv import CHUNK_SIZE
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def cast(value: str, _type: str) -> str | float | bool | date | datetime | bytes | None:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    match _type:
+        case "float":
+            return float_casting(value)
+        case "bool":
+            return bool_casting(value)
+        case "json":
+            # in hydra json are given to postgres as strings, conversion is done by postgres
+            return json.loads(value)
+        case "date":
+            _date = date_casting(value)
+            return _date.date() if _date else None
+        case "datetime":
+            return date_casting(value)
+        case "binary":
+            return binary_casting(value)
+        case _:
+            raise ValueError(f"Unknown type `{_type}`")
+
+
+def cast_df(
+    df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+) -> pd.DataFrame:
+    # for efficiency this modifies the dataframe in place as we don't need it anymore afterwards
+    if verbose:
+        start = time()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (
+            detection["python_type"] == "json" and not cast_json
+        ):
+            # no change if detected type is string
+            continue
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            df[col_name] = df[col_name].astype(pd.Int64Dtype())
+        else:
+            df[col_name] = df[col_name].apply(lambda col: cast(col, _type=detection["python_type"]))
+    if verbose:
+        display_logs_depending_process_time(
+            f"Casting columns completed in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return df
+
+
+def cast_df_chunks(
+    df: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    cast_json: bool = True,
+    verbose: bool = False,
+) -> Iterator[pd.DataFrame]:
+    if analysis.get("engine") or analysis["total_lines"] <= CHUNK_SIZE:
+        # the file is loaded in one chunk, so returning the cast df
+        yield cast_df(
+            df=df,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    else:
+        # loading the csv in chunks using the analysis
+        chunks = pd.read_csv(
+            file_path,
+            dtype=str,
+            sep=analysis["separator"],
+            encoding=analysis["encoding"],
+            skiprows=analysis["header_row_idx"],
+            compression=analysis.get("compression"),
+            chunksize=CHUNK_SIZE,
+        )
+        for chunk in chunks:
+            yield cast_df(
+                df=chunk,
+                columns=analysis["columns"],
+                cast_json=cast_json,
+                verbose=verbose,
+            )
```
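To make the casting contract concrete, here is a minimal usage sketch of `cast_df` (the toy `columns` mapping mimics the `python_type` keys the function reads; in the package this dict comes out of the analysis step):

```python
import pandas as pd

from csv_detective.output.dataframe import cast_df

# Toy frame: every cell starts as a string (or missing), as after a dtype=str read.
df = pd.DataFrame({"age": ["12", None, "34"], "ok": ["oui", "non", None]})

# Hand-written stand-in for the detection output.
columns = {
    "age": {"python_type": "int"},
    "ok": {"python_type": "bool"},  # assumes bool_casting recognises "oui"/"non"
}

cast = cast_df(df, columns)
# "age" becomes a nullable Int64 column; "ok" holds booleans, with None for blanks
```

`cast_df_chunks` wraps the same casting: if the analysis says the file fitted in a single chunk it yields the cast frame directly, otherwise it re-reads the CSV with the detected separator, encoding and header row, `CHUNK_SIZE` rows at a time.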
csv_detective/output/example.py (new file):

```diff
@@ -0,0 +1,250 @@
+import json
+import random
+import string
+import uuid
+from datetime import datetime
+from typing import Any, Type
+
+import pandas as pd
+import requests
+import rstr
+from faker import Faker
+
+from csv_detective.utils import is_url
+
+fake = Faker()
+
+
+def create_example_csv_file(
+    fields: dict | None = None,
+    schema_path: str | None = None,
+    file_length: int = 10,
+    output_name: str | None = "example_file.csv",
+    output_sep: str = ";",
+    encoding: str = "utf-8",
+    ignore_required: bool = False,
+) -> pd.DataFrame:
+    """
+    Create an example file based on a list of dicts like follows:
+    fields = [
+        {
+            "name": "column_name",
+            "type": "column_type",
+            "args": {dict_of_args}  # optional
+        },
+        ...
+    ]
+    Or from a TableSchema
+    """
+    # need to make a CLI command
+
+    if not (fields or schema_path):
+        raise ValueError("At least fields or schema_path must be specified.")
+
+    def potential_skip(required: bool) -> bool:
+        if ignore_required:
+            return False
+        if not required:
+            # for now 30% chance to have an optional value, this could go as an argument
+            return random.randint(1, 100) <= 30
+
+    def _string(
+        length: int = 10,
+        required: bool = True,
+        pattern: str | None = None,
+        enum: str | None = None,
+    ) -> str:
+        if potential_skip(required):
+            return ""
+        if pattern is not None:
+            return rstr.xeger(pattern)
+        elif enum is not None:
+            return random.choice(enum)
+        else:
+            letters = string.ascii_lowercase
+            return "".join(random.choice(letters) for i in range(length))
+
+    def _id(
+        required: bool = True,
+    ) -> str:
+        if potential_skip(required):
+            return ""
+        return str(uuid.uuid4())
+
+    def _date(
+        date_range: list[str] | None = None,
+        format: str = "%Y-%m-%d",
+        required: bool = True,
+    ) -> str:
+        # the bounds specified in date_range are expected in the same format as the desired output format
+        assert all([k in format for k in ["%d", "%m", "%Y"]])
+        if potential_skip(required):
+            return ""
+        if date_range is None:
+            return fake.date(format)
+        else:
+            if len(date_range) != 2:
+                raise ValueError("'date_range' must have exactly two elements.")
+            return fake.date_between_dates(
+                datetime.strptime(date_range[0], format),
+                datetime.strptime(date_range[1], format),
+            ).strftime(format)
+
+    def _time(
+        format: str = "%H:%M:%S",
+        required: bool = True,
+    ) -> str:
+        assert all([k in format for k in ["%H", "%M", "%S"]])
+        if potential_skip(required):
+            return ""
+        # maybe add a time_range argument?
+        return fake.time(format)
+
+    def _datetime(
+        datetime_range: list[str] | None = None,
+        format: str = "%Y-%m-%d %H-%M-%S",
+        required: bool = True,
+    ) -> str:
+        # the bounds specified in datetime_range are expected in the same format as the desired output format
+        assert all([k in format for k in ["%d", "%m", "%Y", "%H", "%M", "%S"]])
+        if potential_skip(required):
+            return ""
+        if datetime_range is None:
+            return fake.date_time().strftime(format)
+        else:
+            if len(datetime_range) != 2:
+                raise ValueError("'date_range' must have exactly two elements.")
+            return fake.date_time_between(
+                datetime.strptime(datetime_range[0], format),
+                datetime.strptime(datetime_range[1], format),
+            ).strftime(format)
+
+    def _url(required: bool = True) -> str:
+        if potential_skip(required):
+            return ""
+        return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
+
+    def _number(
+        num_type: Type[int | float] = int,
+        num_range: list[float] | None = None,
+        enum: list | None = None,
+        required: bool = True,
+    ) -> int | float:
+        assert num_range is None or len(num_range) == 2
+        if potential_skip(required):
+            return ""
+        if enum:
+            return random.choice(enum)
+        if num_range is None:
+            num_range = [0, 1000]
+        if num_type is int:
+            return random.randint(num_range[0], num_range[1])
+        else:
+            return round(random.uniform(num_range[0], num_range[1]), 1)
+
+    def _bool(required: bool = True) -> bool:
+        if potential_skip(required):
+            return ""
+        return random.randint(0, 1) == 0
+
+    def _array(enum: list[Any], required: bool = True) -> str:
+        if potential_skip(required):
+            return ""
+        return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
+
+    def build_args_from_constraints(constraints: dict) -> dict:
+        args = {}
+        args["required"] = constraints.get("required", False)
+        for _ in ["pattern", "enum", "format"]:
+            if _ in constraints:
+                args[_] = constraints[_]
+        if "minimum" in constraints and "maximum" in constraints:
+            args["num_range"] = [constraints["minimum"], constraints["maximum"]]
+        # maybe there are better values than these?
+        elif "minimum" in constraints:
+            args["num_range"] = [constraints["minimum"], 10 + constraints["minimum"]]
+        elif "maximum" in constraints:
+            args["num_range"] = [constraints["maximum"] - 10, constraints["maximum"]]
+        if "minLength" in constraints:
+            args["length"] = constraints["minLength"]
+        if "maxLength" in constraints:
+            args["length"] = constraints["maxLength"]
+        return args
+
+    schema_types_to_python = {
+        "number": "float",
+        "integer": "int",
+        "string": "str",
+        "year": "year",
+        "boolean": "bool",
+        "date": "date",
+        "yearmonth": "date",
+        "time": "time",
+        "datetime": "datetime",
+        "array": "array",
+    }
+
+    if schema_path:
+        if is_url(schema_path):
+            schema = requests.get(schema_path).json()
+        else:
+            with open(schema_path, encoding=encoding) as jsonfile:
+                schema = json.load(jsonfile)
+        if "fields" not in schema.keys():
+            raise ValueError("The schema must have a 'fields' key.")
+        else:
+            fields = [
+                {
+                    "name": f["name"],
+                    "type": schema_types_to_python.get(f["type"], "str"),
+                    # when frformat is supported in TableSchema, we can build args for French standards
+                    # linked to https://github.com/datagouv/fr-format/issues/26
+                    "args": (
+                        build_args_from_constraints(f["constraints"])
+                        if "constraints" in f.keys()
+                        else build_args_from_constraints(f["arrayItem"]["constraints"])
+                        if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
+                        else {}
+                    ),
+                }
+                for f in schema["fields"]
+            ]
+
+    for k in range(len(fields)):
+        if "args" not in fields[k]:
+            fields[k]["args"] = {}
+        if fields[k]["type"] == "float":
+            fields[k]["args"]["num_type"] = float
+        elif fields[k]["type"] == "int":
+            fields[k]["args"]["num_type"] = int
+        elif fields[k]["type"] == "year":
+            fields[k]["args"]["num_type"] = int
+            fields[k]["args"]["num_range"] = [1990, 2050]
+
+    types_to_func = {
+        "int": _number,
+        "float": _number,
+        "date": _date,
+        "time": _time,
+        "str": _string,
+        "url": _url,
+        "id": _id,
+        "year": _number,
+        "bool": _bool,
+        "datetime": _datetime,
+        "array": _array,
+    }
+
+    # would it be better to create by column or by row (as for now)?
+    output = pd.DataFrame(
+        [
+            [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+            for _ in range(file_length)
+        ],
+        columns=[f["name"] for f in fields],
+    )
+
+    if output_name:
+        output.to_csv(output_name, sep=output_sep, index=False)
+
+    return output
```
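For instance, a hypothetical call generating a five-row example frame from an inline `fields` list (passing `output_name=None` returns the DataFrame without writing a CSV):

```python
from csv_detective.output.example import create_example_csv_file

df = create_example_csv_file(
    fields=[
        {"name": "id", "type": "id"},
        {"name": "score", "type": "float", "args": {"num_range": [0, 1]}},
        {"name": "created", "type": "date"},
    ],
    file_length=5,
    output_name=None,  # skip the to_csv() call
)
```

Note that despite the `fields: dict | None` annotation, both the docstring and the code treat `fields` as a list of dicts.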
csv_detective/output/profile.py (new file):

```diff
@@ -0,0 +1,119 @@
+import logging
+from collections import defaultdict
+from time import time
+
+import numpy as np
+import pandas as pd
+
+from csv_detective.formats.float import float_casting
+from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
+
+
+def create_profile(
+    table: pd.DataFrame,
+    columns: dict,
+    num_rows: int,
+    limited_output: bool = True,
+    cast_json: bool = True,
+    verbose: bool = False,
+    _col_values: dict[str, pd.Series] | None = None,
+) -> dict:
+    if verbose:
+        start = time()
+        logging.info("Creating profile")
+
+    if num_rows > 0:
+        raise ValueError("To create profiles num_rows has to be set to -1")
+    if not limited_output:
+        columns = {
+            k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
+            for k, v in columns.items()
+        }
+    # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
+    _count_col = "count"
+    while _count_col in table.columns:
+        _count_col = "_" + _count_col
+    profile = defaultdict(dict)
+    for c in table.columns:
+        # for numerical formats we want min, max, mean, std
+        if columns[c]["python_type"] in ["float", "int"]:
+            # if we have read the file in chunks we already have what we need
+            if _col_values is None:
+                # we locally cast the column to perform the operations,
+                # using the same method as in cast_df
+                cast_col = (
+                    table[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                    "max": cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                    "std": cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
+                }
+            else:
+                cast_col = _col_values[c].reset_index()
+                cast_col = cast_col.loc[cast_col[c].notna()]
+                cast_col[c] = (
+                    cast_col[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else cast_col[c].apply(
+                        lambda x: float_casting(x) if isinstance(x, str) else pd.NA
+                    )
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col[c].min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(
+                        (cast_col[c] * cast_col["count"]).sum() / sum(cast_col["count"]),
+                        columns[c]["python_type"],
+                    ),
+                    "max": cast_prevent_nan(cast_col[c].max(), columns[c]["python_type"]),
+                }
+                stats["std"] = cast_prevent_nan(
+                    np.sqrt(
+                        sum(cast_col["count"] * (cast_col[c] - stats["mean"]) ** 2)
+                        / sum(cast_col["count"])
+                    ),
+                    columns[c]["python_type"],
+                )
+            profile[c].update(**stats)
+            del cast_col
+        # for all formats we want most frequent values, nb unique values and nb missing values
+        tops_bruts = (
+            (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
+            .reset_index(name=_count_col)
+            .iloc[:10]
+            .to_dict(orient="records")
+        )
+        profile[c].update(
+            tops=[
+                {
+                    "count": tb[_count_col],
+                    "value": tb[c],
+                }
+                for tb in tops_bruts
+            ],
+            nb_distinct=(
+                (
+                    table[c].nunique()
+                    if columns[c]["python_type"] != "json" or not cast_json
+                    # a column containing cast json is not serializable
+                    else table[c].astype(str).nunique()
+                )
+                if _col_values is None
+                else len(_col_values)
+            ),
+            nb_missing_values=(
+                len(table[c].loc[table[c].isna()])
+                if _col_values is None
+                else (_col_values[c].loc[pd.NA] if pd.NA in _col_values[c].index else 0)
+            ),
+        )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Created profile in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    del _col_values
+    return profile
```
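A minimal sketch of a direct call (profiles require the full file, hence the `num_rows=-1` guard; values are left as raw strings because the function casts numeric columns itself):

```python
import pandas as pd

from csv_detective.output.profile import create_profile

# Raw, still-string table, as produced by a dtype=str read of the whole file.
table = pd.DataFrame({"price": ["1.5", "2.0", "2.0", "3.5"]})
columns = {"price": {"python_type": "float"}}

profile = create_profile(table, columns, num_rows=-1)
# profile["price"] holds min/mean/max/std plus tops, nb_distinct and nb_missing_values
```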