csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.2549__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/__init__.py +0 -0
- csv_detective/detection/columns.py +0 -0
- csv_detective/detection/encoding.py +0 -0
- csv_detective/detection/engine.py +0 -0
- csv_detective/detection/formats.py +38 -11
- csv_detective/detection/headers.py +14 -12
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/separator.py +0 -0
- csv_detective/detection/variables.py +0 -0
- csv_detective/explore_csv.py +6 -18
- csv_detective/format.py +5 -12
- csv_detective/formats/__init__.py +0 -0
- csv_detective/formats/adresse.py +9 -9
- csv_detective/formats/binary.py +1 -2
- csv_detective/formats/booleen.py +2 -3
- csv_detective/formats/code_commune_insee.py +10 -12
- csv_detective/formats/code_csp_insee.py +1 -1
- csv_detective/formats/code_departement.py +7 -8
- csv_detective/formats/code_fantoir.py +5 -6
- csv_detective/formats/code_import.py +1 -1
- csv_detective/formats/code_postal.py +9 -10
- csv_detective/formats/code_region.py +6 -7
- csv_detective/formats/code_rna.py +6 -7
- csv_detective/formats/code_waldec.py +1 -1
- csv_detective/formats/commune.py +5 -5
- csv_detective/formats/csp_insee.py +5 -6
- csv_detective/formats/data/insee_ape700.txt +1 -1
- csv_detective/formats/data/iso_country_code_alpha2.txt +397 -153
- csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
- csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
- csv_detective/formats/date.py +18 -28
- csv_detective/formats/date_fr.py +1 -1
- csv_detective/formats/datetime_aware.py +2 -7
- csv_detective/formats/datetime_naive.py +0 -3
- csv_detective/formats/datetime_rfc822.py +0 -1
- csv_detective/formats/departement.py +15 -15
- csv_detective/formats/email.py +13 -13
- csv_detective/formats/float.py +1 -2
- csv_detective/formats/geojson.py +10 -10
- csv_detective/formats/insee_ape700.py +8 -10
- csv_detective/formats/insee_canton.py +6 -6
- csv_detective/formats/int.py +1 -2
- csv_detective/formats/iso_country_code_alpha2.py +14 -14
- csv_detective/formats/iso_country_code_alpha3.py +13 -6
- csv_detective/formats/iso_country_code_numeric.py +9 -2
- csv_detective/formats/jour_de_la_semaine.py +12 -11
- csv_detective/formats/json.py +0 -6
- csv_detective/formats/latitude_l93.py +22 -8
- csv_detective/formats/latitude_wgs.py +29 -31
- csv_detective/formats/latitude_wgs_fr_metropole.py +30 -7
- csv_detective/formats/latlon_wgs.py +28 -30
- csv_detective/formats/longitude_l93.py +13 -8
- csv_detective/formats/longitude_wgs.py +19 -34
- csv_detective/formats/longitude_wgs_fr_metropole.py +19 -6
- csv_detective/formats/lonlat_wgs.py +11 -12
- csv_detective/formats/mois_de_lannee.py +1 -1
- csv_detective/formats/money.py +1 -1
- csv_detective/formats/mongo_object_id.py +1 -1
- csv_detective/formats/pays.py +13 -11
- csv_detective/formats/percent.py +1 -1
- csv_detective/formats/region.py +13 -13
- csv_detective/formats/sexe.py +1 -1
- csv_detective/formats/siren.py +10 -9
- csv_detective/formats/siret.py +9 -9
- csv_detective/formats/tel_fr.py +13 -7
- csv_detective/formats/uai.py +18 -17
- csv_detective/formats/url.py +16 -16
- csv_detective/formats/username.py +1 -1
- csv_detective/formats/uuid.py +1 -1
- csv_detective/formats/year.py +12 -7
- csv_detective/output/__init__.py +0 -0
- csv_detective/output/dataframe.py +3 -8
- csv_detective/output/example.py +0 -0
- csv_detective/output/profile.py +2 -6
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/parsing/__init__.py +0 -0
- csv_detective/parsing/columns.py +1 -1
- csv_detective/parsing/compression.py +0 -0
- csv_detective/parsing/csv.py +0 -0
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +12 -11
- csv_detective/parsing/text.py +12 -13
- csv_detective/validate.py +36 -71
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.2549.dist-info}/METADATA +18 -15
- csv_detective-0.10.2549.dist-info/RECORD +92 -0
- csv_detective-0.10.2549.dist-info/WHEEL +4 -0
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.2549.dist-info}/entry_points.txt +1 -0
- csv_detective-0.10.3.dev7.dist-info/RECORD +0 -111
- csv_detective-0.10.3.dev7.dist-info/WHEEL +0 -5
- csv_detective-0.10.3.dev7.dist-info/licenses/LICENSE +0 -21
- csv_detective-0.10.3.dev7.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/data/a_test_file.csv +0 -407
- tests/data/a_test_file.json +0 -394
- tests/data/b_test_file.csv +0 -7
- tests/data/c_test_file.csv +0 -2
- tests/data/csv_file +0 -7
- tests/data/file.csv.gz +0 -0
- tests/data/file.ods +0 -0
- tests/data/file.xls +0 -0
- tests/data/file.xlsx +0 -0
- tests/data/xlsx_file +0 -0
- tests/test_example.py +0 -67
- tests/test_fields.py +0 -175
- tests/test_file.py +0 -468
- tests/test_labels.py +0 -26
- tests/test_structure.py +0 -45
- tests/test_validation.py +0 -163
|
@@ -3,39 +3,37 @@ from csv_detective.formats.longitude_wgs import _is as is_lon
|
|
|
3
3
|
|
|
4
4
|
proportion = 1
|
|
5
5
|
tags = ["geo"]
|
|
6
|
-
mandatory_label = True
|
|
7
|
-
|
|
8
|
-
SHARED_COORDS_LABELS = {
|
|
9
|
-
"ban": 1,
|
|
10
|
-
"coordinates": 1,
|
|
11
|
-
"coordonnees": 1,
|
|
12
|
-
"coordonnees insee": 1,
|
|
13
|
-
"coord": 1,
|
|
14
|
-
"geo": 0.5,
|
|
15
|
-
"geopoint": 1,
|
|
16
|
-
"geoloc": 1,
|
|
17
|
-
"geolocalisation": 1,
|
|
18
|
-
"geom": 0.75,
|
|
19
|
-
"geometry": 1,
|
|
20
|
-
"gps": 1,
|
|
21
|
-
"localisation": 1,
|
|
22
|
-
"point": 1,
|
|
23
|
-
"position": 1,
|
|
24
|
-
"wgs84": 1,
|
|
25
|
-
}
|
|
26
6
|
|
|
27
|
-
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
|
|
7
|
+
SHARED_COORDS_LABELS = [
|
|
8
|
+
"ban",
|
|
9
|
+
"coordinates",
|
|
10
|
+
"coordonnees",
|
|
11
|
+
"coordonnees insee",
|
|
12
|
+
"geo",
|
|
13
|
+
"geopoint",
|
|
14
|
+
"geoloc",
|
|
15
|
+
"geolocalisation",
|
|
16
|
+
"geom",
|
|
17
|
+
"geometry",
|
|
18
|
+
"gps",
|
|
19
|
+
"localisation",
|
|
20
|
+
"point",
|
|
21
|
+
"position",
|
|
22
|
+
"wgs84",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
specific = [
|
|
26
|
+
"latlon",
|
|
27
|
+
"lat lon",
|
|
28
|
+
"x y",
|
|
29
|
+
"xy",
|
|
30
|
+
]
|
|
33
31
|
|
|
34
32
|
# we aim wide to catch exact matches if possible for the highest possible score
|
|
35
33
|
labels = (
|
|
36
34
|
SHARED_COORDS_LABELS
|
|
37
|
-
|
|
38
|
-
|
|
35
|
+
+ specific
|
|
36
|
+
+ [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
|
|
39
37
|
)
|
|
40
38
|
|
|
41
39
|
|
|
@@ -50,6 +48,6 @@ def _is(val):
|
|
|
50
48
|
|
|
51
49
|
|
|
52
50
|
_test_values = {
|
|
53
|
-
True: ["43.2,-22.6", "-10.
|
|
54
|
-
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"
|
|
51
|
+
True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
|
|
52
|
+
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
|
|
55
53
|
}
|
|
@@ -2,17 +2,22 @@ from frformat import LongitudeL93
|
|
|
2
2
|
|
|
3
3
|
from csv_detective.formats.float import _is as is_float
|
|
4
4
|
from csv_detective.formats.float import float_casting
|
|
5
|
-
from csv_detective.formats.longitude_wgs import SHARED_LONGITUDE_LABELS
|
|
6
5
|
|
|
7
6
|
proportion = 1
|
|
8
7
|
tags = ["fr", "geo"]
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
|
|
8
|
+
labels = [
|
|
9
|
+
"longitude",
|
|
10
|
+
"lon",
|
|
11
|
+
"long",
|
|
12
|
+
"geocodage x gps",
|
|
13
|
+
"location longitude",
|
|
14
|
+
"xlongitude",
|
|
15
|
+
"lng",
|
|
16
|
+
"xlong",
|
|
17
|
+
"x",
|
|
18
|
+
"xf",
|
|
19
|
+
"xd",
|
|
20
|
+
]
|
|
16
21
|
|
|
17
22
|
_longitudel93 = LongitudeL93()
|
|
18
23
|
|
|
@@ -1,47 +1,32 @@
|
|
|
1
1
|
from csv_detective.formats.float import _is as is_float
|
|
2
|
-
from csv_detective.formats.int import _is as is_int
|
|
3
2
|
|
|
4
3
|
proportion = 1
|
|
5
4
|
tags = ["geo"]
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
"
|
|
18
|
-
|
|
19
|
-
"xlon": 1,
|
|
20
|
-
"xlong": 1,
|
|
21
|
-
}
|
|
22
|
-
labels = SHARED_LONGITUDE_LABELS | {
|
|
23
|
-
"x gps": 1,
|
|
24
|
-
"longitude wgs84": 1,
|
|
25
|
-
"x wgs84": 1,
|
|
26
|
-
"wsg": 0.75,
|
|
27
|
-
"gps": 0.5,
|
|
28
|
-
}
|
|
5
|
+
labels = [
|
|
6
|
+
"longitude",
|
|
7
|
+
"lon",
|
|
8
|
+
"long",
|
|
9
|
+
"geocodage x gps",
|
|
10
|
+
"location longitude",
|
|
11
|
+
"xlongitude",
|
|
12
|
+
"lng",
|
|
13
|
+
"xlong",
|
|
14
|
+
"x",
|
|
15
|
+
"xf",
|
|
16
|
+
"xd",
|
|
17
|
+
]
|
|
29
18
|
|
|
30
19
|
|
|
31
20
|
def _is(val):
|
|
32
21
|
try:
|
|
33
|
-
return (
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
# but 1.200 is saved as 1.2 in csv so we just discriminate ints
|
|
38
|
-
and not is_int(val)
|
|
39
|
-
)
|
|
40
|
-
except Exception:
|
|
22
|
+
return is_float(val) and float(val) >= -180 and float(val) <= 180
|
|
23
|
+
except ValueError:
|
|
24
|
+
return False
|
|
25
|
+
except OverflowError:
|
|
41
26
|
return False
|
|
42
27
|
|
|
43
28
|
|
|
44
29
|
_test_values = {
|
|
45
|
-
True: ["120
|
|
46
|
-
False: ["-200"
|
|
30
|
+
True: ["120", "-20.2"],
|
|
31
|
+
False: ["-200"],
|
|
47
32
|
}
|
|
@@ -1,19 +1,32 @@
|
|
|
1
|
-
from csv_detective.formats.
|
|
1
|
+
from csv_detective.formats.float import _is as is_float
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"longitude",
|
|
7
|
+
"lon",
|
|
8
|
+
"long",
|
|
9
|
+
"geocodage x gps",
|
|
10
|
+
"location longitude",
|
|
11
|
+
"xlongitude",
|
|
12
|
+
"lng",
|
|
13
|
+
"xlong",
|
|
14
|
+
"x",
|
|
15
|
+
"xf",
|
|
16
|
+
"xd",
|
|
17
|
+
]
|
|
7
18
|
|
|
8
19
|
|
|
9
20
|
def _is(val):
|
|
10
21
|
try:
|
|
11
|
-
return
|
|
12
|
-
except
|
|
22
|
+
return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8
|
|
23
|
+
except ValueError:
|
|
24
|
+
return False
|
|
25
|
+
except OverflowError:
|
|
13
26
|
return False
|
|
14
27
|
|
|
15
28
|
|
|
16
29
|
_test_values = {
|
|
17
|
-
True: ["-2.
|
|
30
|
+
True: ["-2.5"],
|
|
18
31
|
False: ["12.8"],
|
|
19
32
|
}
|
|
@@ -4,20 +4,19 @@ from csv_detective.formats.longitude_wgs import _is as is_lon
|
|
|
4
4
|
|
|
5
5
|
proportion = 1
|
|
6
6
|
tags = ["geo"]
|
|
7
|
-
mandatory_label = True
|
|
8
7
|
|
|
9
|
-
specific =
|
|
10
|
-
"lonlat"
|
|
11
|
-
"lon lat"
|
|
12
|
-
"y x"
|
|
13
|
-
"yx"
|
|
14
|
-
|
|
8
|
+
specific = [
|
|
9
|
+
"lonlat",
|
|
10
|
+
"lon lat",
|
|
11
|
+
"y x",
|
|
12
|
+
"yx",
|
|
13
|
+
]
|
|
15
14
|
|
|
16
15
|
# we aim wide to catch exact matches if possible for the highest possible score
|
|
17
|
-
|
|
16
|
+
words = (
|
|
18
17
|
SHARED_COORDS_LABELS
|
|
19
|
-
|
|
20
|
-
|
|
18
|
+
+ specific
|
|
19
|
+
+ [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
|
|
21
20
|
)
|
|
22
21
|
|
|
23
22
|
|
|
@@ -32,6 +31,6 @@ def _is(val):
|
|
|
32
31
|
|
|
33
32
|
|
|
34
33
|
_test_values = {
|
|
35
|
-
True: ["-22.6,43.
|
|
36
|
-
False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"
|
|
34
|
+
True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
|
|
35
|
+
False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
|
|
37
36
|
}
|
csv_detective/formats/money.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from csv_detective.formats.float import _is as is_float
|
|
2
2
|
|
|
3
3
|
proportion = 0.8
|
|
4
|
-
labels =
|
|
4
|
+
labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
|
|
5
5
|
|
|
6
6
|
currencies = {"€", "$", "£", "¥"}
|
|
7
7
|
|
csv_detective/formats/pays.py
CHANGED
|
@@ -2,17 +2,19 @@ from frformat import Millesime, Options, Pays
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.6
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
labels =
|
|
6
|
-
"pays"
|
|
7
|
-
"payslieu"
|
|
8
|
-
"paysorg"
|
|
9
|
-
"country"
|
|
10
|
-
"pays lib"
|
|
11
|
-
"lieupays"
|
|
12
|
-
"pays beneficiaire"
|
|
13
|
-
"nom du pays"
|
|
14
|
-
"
|
|
15
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"pays",
|
|
7
|
+
"payslieu",
|
|
8
|
+
"paysorg",
|
|
9
|
+
"country",
|
|
10
|
+
"pays lib",
|
|
11
|
+
"lieupays",
|
|
12
|
+
"pays beneficiaire",
|
|
13
|
+
"nom du pays",
|
|
14
|
+
"journey start country",
|
|
15
|
+
"libelle pays",
|
|
16
|
+
"journey end country",
|
|
17
|
+
]
|
|
16
18
|
|
|
17
19
|
_options = Options(
|
|
18
20
|
ignore_case=True,
|
csv_detective/formats/percent.py
CHANGED
csv_detective/formats/region.py
CHANGED
|
@@ -2,19 +2,19 @@ from frformat import Millesime, Options, Region
|
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
labels =
|
|
6
|
-
"region"
|
|
7
|
-
"libelle region"
|
|
8
|
-
"nom region"
|
|
9
|
-
"libelle reg"
|
|
10
|
-
"nom reg"
|
|
11
|
-
"reg libusage"
|
|
12
|
-
"nom de la region"
|
|
13
|
-
"regionorg"
|
|
14
|
-
"regionlieu"
|
|
15
|
-
"reg"
|
|
16
|
-
"nom officiel region"
|
|
17
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"region",
|
|
7
|
+
"libelle region",
|
|
8
|
+
"nom region",
|
|
9
|
+
"libelle reg",
|
|
10
|
+
"nom reg",
|
|
11
|
+
"reg libusage",
|
|
12
|
+
"nom de la region",
|
|
13
|
+
"regionorg",
|
|
14
|
+
"regionlieu",
|
|
15
|
+
"reg",
|
|
16
|
+
"nom officiel region",
|
|
17
|
+
]
|
|
18
18
|
|
|
19
19
|
_extra_valid_values_set = frozenset(
|
|
20
20
|
{
|
csv_detective/formats/sexe.py
CHANGED
csv_detective/formats/siren.py
CHANGED
|
@@ -2,15 +2,16 @@ import re
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.9
|
|
4
4
|
tags = ["fr"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"siren"
|
|
8
|
-
"
|
|
9
|
-
"siren
|
|
10
|
-
"siren
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"siren",
|
|
7
|
+
"siren organisme designe",
|
|
8
|
+
"siren organisme designant",
|
|
9
|
+
"n° siren",
|
|
10
|
+
"siren organisme",
|
|
11
|
+
"siren titulaire",
|
|
12
|
+
"numero siren",
|
|
13
|
+
"epci",
|
|
14
|
+
]
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
def _is(val):
|
csv_detective/formats/siret.py
CHANGED
|
@@ -2,15 +2,15 @@ import re
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.8
|
|
4
4
|
tags = ["fr"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"siret"
|
|
8
|
-
"num siret"
|
|
9
|
-
"siretacheteur"
|
|
10
|
-
"n° siret"
|
|
11
|
-
"coll siret"
|
|
12
|
-
"epci"
|
|
13
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"siret",
|
|
7
|
+
"siret d",
|
|
8
|
+
"num siret",
|
|
9
|
+
"siretacheteur",
|
|
10
|
+
"n° siret",
|
|
11
|
+
"coll siret",
|
|
12
|
+
"epci",
|
|
13
|
+
]
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def _is(val):
|
csv_detective/formats/tel_fr.py
CHANGED
|
@@ -2,13 +2,19 @@ import re
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.7
|
|
4
4
|
tags = ["fr"]
|
|
5
|
-
labels =
|
|
6
|
-
"telephone"
|
|
7
|
-
"tel"
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"telephone",
|
|
7
|
+
"tel",
|
|
8
|
+
"tel1",
|
|
9
|
+
"tel2",
|
|
10
|
+
"phone",
|
|
11
|
+
"num tel",
|
|
12
|
+
"tel mob",
|
|
13
|
+
"telephone sav",
|
|
14
|
+
"telephone1",
|
|
15
|
+
"coordinates.phone",
|
|
16
|
+
"telephone du lieu",
|
|
17
|
+
]
|
|
12
18
|
|
|
13
19
|
|
|
14
20
|
def _is(val):
|
csv_detective/formats/uai.py
CHANGED
|
@@ -2,23 +2,24 @@ import re
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.8
|
|
4
4
|
tags = ["fr"]
|
|
5
|
-
labels =
|
|
6
|
-
"uai"
|
|
7
|
-
"code etablissement"
|
|
8
|
-
"code uai"
|
|
9
|
-
"uai - identifiant"
|
|
10
|
-
"numero uai"
|
|
11
|
-
"rne"
|
|
12
|
-
"numero de l'etablissement"
|
|
13
|
-
"code rne"
|
|
14
|
-
"codeetab"
|
|
15
|
-
"code uai de l'etablissement"
|
|
16
|
-
"ref uai"
|
|
17
|
-
"cd rne"
|
|
18
|
-
"numerouai"
|
|
19
|
-
"numero d etablissement"
|
|
20
|
-
"
|
|
21
|
-
|
|
5
|
+
labels = [
|
|
6
|
+
"uai",
|
|
7
|
+
"code etablissement",
|
|
8
|
+
"code uai",
|
|
9
|
+
"uai - identifiant",
|
|
10
|
+
"numero uai",
|
|
11
|
+
"rne",
|
|
12
|
+
"numero de l'etablissement",
|
|
13
|
+
"code rne",
|
|
14
|
+
"codeetab",
|
|
15
|
+
"code uai de l'etablissement",
|
|
16
|
+
"ref uai",
|
|
17
|
+
"cd rne",
|
|
18
|
+
"numerouai",
|
|
19
|
+
"numero d etablissement",
|
|
20
|
+
"code etablissement",
|
|
21
|
+
"numero etablissement",
|
|
22
|
+
]
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
def _is(val):
|
csv_detective/formats/url.py
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
import re
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
|
-
labels =
|
|
5
|
-
"url"
|
|
6
|
-
"url source"
|
|
7
|
-
"site web"
|
|
8
|
-
"source url"
|
|
9
|
-
"site internet"
|
|
10
|
-
"remote url"
|
|
11
|
-
"web"
|
|
12
|
-
"site"
|
|
13
|
-
"lien"
|
|
14
|
-
"site data"
|
|
15
|
-
"lien url"
|
|
16
|
-
"lien vers le fichier"
|
|
17
|
-
"sitweb"
|
|
18
|
-
"interneturl"
|
|
19
|
-
|
|
4
|
+
labels = [
|
|
5
|
+
"url",
|
|
6
|
+
"url source",
|
|
7
|
+
"site web",
|
|
8
|
+
"source url",
|
|
9
|
+
"site internet",
|
|
10
|
+
"remote url",
|
|
11
|
+
"web",
|
|
12
|
+
"site",
|
|
13
|
+
"lien",
|
|
14
|
+
"site data",
|
|
15
|
+
"lien url",
|
|
16
|
+
"lien vers le fichier",
|
|
17
|
+
"sitweb",
|
|
18
|
+
"interneturl",
|
|
19
|
+
]
|
|
20
20
|
|
|
21
21
|
pattern = re.compile(
|
|
22
22
|
r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
|
csv_detective/formats/uuid.py
CHANGED
csv_detective/formats/year.py
CHANGED
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
proportion = 1
|
|
2
2
|
tags = ["temp"]
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
"
|
|
6
|
-
"annee"
|
|
7
|
-
"
|
|
8
|
-
"exercice"
|
|
9
|
-
|
|
3
|
+
labels = [
|
|
4
|
+
"year",
|
|
5
|
+
"annee",
|
|
6
|
+
"annee depot",
|
|
7
|
+
"an nais",
|
|
8
|
+
"exercice",
|
|
9
|
+
"data year",
|
|
10
|
+
"annee de publication",
|
|
11
|
+
"exercice comptable",
|
|
12
|
+
"annee de naissance",
|
|
13
|
+
"annee ouverture",
|
|
14
|
+
]
|
|
10
15
|
|
|
11
16
|
|
|
12
17
|
def _is(val):
|
csv_detective/output/__init__.py
CHANGED
|
File without changes
|
|
@@ -13,16 +13,11 @@ from csv_detective.parsing.csv import CHUNK_SIZE
|
|
|
13
13
|
from csv_detective.utils import display_logs_depending_process_time
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
def cast(value: str, _type: str) -> str |
|
|
17
|
-
if not isinstance(value, str) or value
|
|
18
|
-
#
|
|
16
|
+
def cast(value: str, _type: str) -> str | float | bool | date | datetime | bytes | None:
|
|
17
|
+
if not isinstance(value, str) or not value:
|
|
18
|
+
# None is the current default value in hydra, should we keep this?
|
|
19
19
|
return None
|
|
20
20
|
match _type:
|
|
21
|
-
case "string":
|
|
22
|
-
# not used here, convenience for external use (cc hydra)
|
|
23
|
-
return value
|
|
24
|
-
case "int":
|
|
25
|
-
return int(value)
|
|
26
21
|
case "float":
|
|
27
22
|
return float_casting(value)
|
|
28
23
|
case "bool":
|
csv_detective/output/example.py
CHANGED
|
File without changes
|
csv_detective/output/profile.py
CHANGED
|
@@ -23,7 +23,7 @@ def create_profile(
|
|
|
23
23
|
logging.info("Creating profile")
|
|
24
24
|
|
|
25
25
|
if num_rows > 0:
|
|
26
|
-
raise ValueError("To create
|
|
26
|
+
raise ValueError("To create profiles num_rows has to be set to -1")
|
|
27
27
|
if not limited_output:
|
|
28
28
|
columns = {
|
|
29
29
|
k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
|
|
@@ -81,11 +81,7 @@ def create_profile(
|
|
|
81
81
|
del cast_col
|
|
82
82
|
# for all formats we want most frequent values, nb unique values and nb missing values
|
|
83
83
|
tops_bruts = (
|
|
84
|
-
(
|
|
85
|
-
table[c].value_counts()
|
|
86
|
-
if _col_values is None
|
|
87
|
-
else (s := _col_values[c]).loc[s.index.notna()].sort_values(ascending=False)
|
|
88
|
-
)
|
|
84
|
+
(table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
|
|
89
85
|
.reset_index(name=_count_col)
|
|
90
86
|
.iloc[:10]
|
|
91
87
|
.to_dict(orient="records")
|
csv_detective/output/schema.py
CHANGED
|
File without changes
|
csv_detective/output/utils.py
CHANGED
|
File without changes
|
|
File without changes
|
csv_detective/parsing/columns.py
CHANGED
|
File without changes
|
csv_detective/parsing/csv.py
CHANGED
|
File without changes
|
csv_detective/parsing/excel.py
CHANGED
|
@@ -23,7 +23,7 @@ def parse_excel(
|
|
|
23
23
|
file_path: str,
|
|
24
24
|
num_rows: int = -1,
|
|
25
25
|
engine: str | None = None,
|
|
26
|
-
sheet_name: str |
|
|
26
|
+
sheet_name: str | None = None,
|
|
27
27
|
random_state: int = 42,
|
|
28
28
|
verbose: bool = False,
|
|
29
29
|
) -> tuple[pd.DataFrame, int, int, str, str, int]:
|