PyPI - csv-detective - Versions diffs - 0.7.5.dev1286__py3-none-any.whl → 0.7.5.dev1307__py3-none-any.whl - Mend

csv-detective 0.7.5.dev1286__py3-none-any.whl → 0.7.5.dev1307__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

csv_detective/detect_fields/__init__.py CHANGED Viewed

@@ -10,19 +10,21 @@ from .FR.other import (
     insee_ape700,
     date_fr,
     code_waldec,
-    code_rna
+    code_rna,
 )
 from .other import (
     email,
     url,
     booleen,
+    money,
     mongo_object_id,
+    percent,
     twitter,
     float,
     int,
     uuid,
-    json
+    json,
 )
 from .FR.geo import (
@@ -40,7 +42,7 @@ from .FR.geo import (
     code_region,
     latitude_l93,
     longitude_l93,
-    insee_canton
+    insee_canton,
 )
 from .geo import (
@@ -50,7 +52,7 @@ from .geo import (
     latitude_wgs,
     longitude_wgs,
     latlon_wgs,
-    json_geojson
+    json_geojson,
 )
 from .FR.temp import jour_de_la_semaine, mois_de_annee

csv_detective/detect_fields/geo/latlon_wgs/__init__.py CHANGED Viewed

@@ -1,13 +1,13 @@
-import re
+from ..latitude_wgs import _is as is_lat
+from ..longitude_wgs import _is as is_lon
-PROPORTION = 0.9
+PROPORTION = 1
 def _is(val):
     '''Renvoie True si val peut etre une latitude,longitude'''
-    return isinstance(val, str) and bool(
-        re.match(
-            r'^\[?[\+\-]?[0-8]?\d\.\d* ?, ?[\+\-]?(1[0-7]\d|\d{1,2})\.\d+\]?$', val
-        )
-    )
+    if not isinstance(val, str) or val.count(",") != 1:
+        return False
+    lat, lon = val.split(",")
+    return is_lat(lat) and is_lon(lon.replace(" ", ""))

csv_detective/detect_fields/other/money/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from ..float import _is as is_float
+currencies = set(["€", "$", "£", "¥"])
+PROPORTION = 0.8
+def _is(val: str):
+    if not isinstance(val, str) or val[-1] not in currencies:
+        return False
+    return is_float(val[:-1])

csv_detective/detect_fields/other/percent/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from ..float import _is as is_float
+PROPORTION = 0.8
+def _is(val: str):
+    if not isinstance(val, str) or val[-1] != "%":
+        return False
+    return is_float(val[:-1])

csv_detective/output/example.py CHANGED Viewed

@@ -17,12 +17,12 @@ def create_example_csv_file(
     fields: Optional[dict] = None,
     schema_path: Optional[str] = None,
     file_length: int = 10,
-    output_name: str = 'example_file.csv',
-    output_sep: str = ';',
-    encoding: str = 'utf-8',
+    output_name: Optional[str] = "example_file.csv",
+    output_sep: str = ";",
+    encoding: str = "utf-8",
     ignore_required: bool = False,
 ) -> pd.DataFrame:
-    '''
+    """
     Create an example file based on a list of dicts like follows:
     fields = [
         {
@@ -33,7 +33,7 @@ def create_example_csv_file(
         ...
     ]
     Or from a TableSchema
-    '''
+    """
     # need to make a CLI command
     if not (fields or schema_path):
@@ -53,65 +53,65 @@ def create_example_csv_file(
         enum: Optional[str] = None,
     ) -> str:
         if potential_skip(required):
-            return ''
+            return ""
         if pattern is not None:
             return rstr.xeger(pattern)
         elif enum is not None:
             return random.choice(enum)
         else:
             letters = string.ascii_lowercase
-            return ''.join(random.choice(letters) for i in range(length))
+            return "".join(random.choice(letters) for i in range(length))
     def _id(
         required: bool = True,
     ) -> str:
         if potential_skip(required):
-            return ''
+            return ""
         return str(uuid.uuid4())
     def _date(
         date_range: Union[None, list[str]] = None,
-        format: str = '%Y-%m-%d',
+        format: str = "%Y-%m-%d",
         required: bool = True,
     ) -> str:
         # the bounds specified in date_range are expected in the same format as the desired output format
-        assert all([k in format for k in ['%d', '%m', '%Y']])
+        assert all([k in format for k in ["%d", "%m", "%Y"]])
         if potential_skip(required):
-            return ''
+            return ""
         if date_range is None:
             return fake.date(format)
         else:
             if len(date_range) != 2:
-                raise ValueError('"date_range" must have exactly two elements.')
+                raise ValueError("'date_range' must have exactly two elements.")
             return fake.date_between_dates(
                 datetime.strptime(date_range[0], format),
                 datetime.strptime(date_range[1], format),
             ).strftime(format)
     def _time(
-        format: str = '%H:%M:%S',
+        format: str = "%H:%M:%S",
         required: bool = True,
     ) -> str:
-        assert all([k in format for k in ['%H', '%M', '%S']])
+        assert all([k in format for k in ["%H", "%M", "%S"]])
         if potential_skip(required):
-            return ''
+            return ""
         # maybe add a time_range argument?
         return fake.time(format)
     def _datetime(
         datetime_range: Optional[list[str]] = None,
-        format: str = '%Y-%m-%d %H-%M-%S',
+        format: str = "%Y-%m-%d %H-%M-%S",
         required: bool = True,
     ) -> str:
         # the bounds specified in datetime_range are expected in the same format as the desired output format
-        assert all([k in format for k in ['%d', '%m', '%Y', '%H', '%M', '%S']])
+        assert all([k in format for k in ["%d", "%m", "%Y", "%H", "%M", "%S"]])
         if potential_skip(required):
-            return ''
+            return ""
         if datetime_range is None:
             return fake.date_time().strftime(format)
         else:
             if len(datetime_range) != 2:
-                raise ValueError('"date_range" must have exactly two elements.')
+                raise ValueError("'date_range' must have exactly two elements.")
             return fake.date_time_between(
                 datetime.strptime(datetime_range[0], format),
                 datetime.strptime(datetime_range[1], format),
@@ -119,8 +119,8 @@ def create_example_csv_file(
     def _url(required: bool = True) -> str:
         if potential_skip(required):
-            return ''
-        return f'http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}'
+            return ""
+        return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
     def _number(
         num_type: Type[Union[int, float]] = int,
@@ -130,7 +130,7 @@ def create_example_csv_file(
     ) -> Union[int, float]:
         assert num_range is None or len(num_range) == 2
         if potential_skip(required):
-            return ''
+            return ""
         if enum:
             return random.choice(enum)
         if num_range is None:
@@ -142,100 +142,100 @@ def create_example_csv_file(
     def _bool(required: bool = True) -> bool:
         if potential_skip(required):
-            return ''
+            return ""
         return random.randint(0, 1) == 0
     def _array(enum: list[Any], required: bool = True) -> str:
         if potential_skip(required):
-            return ''
+            return ""
         return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
     def build_args_from_constraints(constraints: dict) -> dict:
         args = {}
-        args['required'] = constraints.get('required', False)
-        for _ in ['pattern', 'enum', 'format']:
+        args["required"] = constraints.get("required", False)
+        for _ in ["pattern", "enum", "format"]:
             if _ in constraints:
                 args[_] = constraints[_]
-        if 'minimum' in constraints and 'maximum' in constraints:
-            args['num_range'] = [constraints['minimum'], constraints['maximum']]
+        if "minimum" in constraints and "maximum" in constraints:
+            args["num_range"] = [constraints["minimum"], constraints["maximum"]]
         # maybe there are better values than these?
-        elif 'minimum' in constraints:
-            args['num_range'] = [constraints['minimum'], 10 + constraints['minimum']]
-        elif 'maximum' in constraints:
-            args['num_range'] = [constraints['maximum'] - 10, constraints['maximum']]
-        if 'minLength' in constraints:
-            args['length'] = constraints['minLength']
-        if 'maxLength' in constraints:
-            args['length'] = constraints['maxLength']
+        elif "minimum" in constraints:
+            args["num_range"] = [constraints["minimum"], 10 + constraints["minimum"]]
+        elif "maximum" in constraints:
+            args["num_range"] = [constraints["maximum"] - 10, constraints["maximum"]]
+        if "minLength" in constraints:
+            args["length"] = constraints["minLength"]
+        if "maxLength" in constraints:
+            args["length"] = constraints["maxLength"]
         return args
     schema_types_to_python = {
-        'number': 'float',
-        'integer': 'int',
-        'string': 'str',
-        'year': 'year',
-        'boolean': 'bool',
-        'date': 'date',
-        'yearmonth': 'date',
-        'time': 'time',
-        'datetime': 'datetime',
-        'array': 'array'
+        "number": "float",
+        "integer": "int",
+        "string": "str",
+        "year": "year",
+        "boolean": "bool",
+        "date": "date",
+        "yearmonth": "date",
+        "time": "time",
+        "datetime": "datetime",
+        "array": "array"
     }
     if schema_path:
-        if schema_path.startswith('http'):
+        if schema_path.startswith("http"):
             schema = requests.get(schema_path).json()
         else:
             with open(schema_path, encoding=encoding) as jsonfile:
                 schema = json.load(jsonfile)
-        if not ('fields' in schema.keys()):
-            raise ValueError('The schema must have a "fields" key.')
+        if not ("fields" in schema.keys()):
+            raise ValueError("The schema must have a 'fields' key.")
         else:
             fields = [
                 {
-                    'name': f['name'],
-                    'type': schema_types_to_python.get(f['type'], 'str'),
+                    "name": f["name"],
+                    "type": schema_types_to_python.get(f["type"], "str"),
                     # when frformat is supported in TableSchema, we can build args for French standards
                     # linked to https://github.com/datagouv/fr-format/issues/26
-                    'args': (
-                        build_args_from_constraints(f['constraints']) if 'constraints' in f.keys()
-                        else build_args_from_constraints(f['arrayItem']['constraints'])
-                        if 'arrayItem' in f.keys() and 'constraints' in f['arrayItem'].keys()
+                    "args": (
+                        build_args_from_constraints(f["constraints"]) if "constraints" in f.keys()
+                        else build_args_from_constraints(f["arrayItem"]["constraints"])
+                        if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                         else {}
                     )
-                } for f in schema['fields']
+                } for f in schema["fields"]
             ]
     for k in range(len(fields)):
-        if 'args' not in fields[k]:
-            fields[k]['args'] = {}
-        if fields[k]['type'] == 'float':
-            fields[k]['args']['num_type'] = float
-        elif fields[k]['type'] == 'int':
-            fields[k]['args']['num_type'] = int
-        elif fields[k]['type'] == 'year':
-            fields[k]['args']['num_type'] = int
-            fields[k]['args']['num_range'] = [1990, 2050]
+        if "args" not in fields[k]:
+            fields[k]["args"] = {}
+        if fields[k]["type"] == "float":
+            fields[k]["args"]["num_type"] = float
+        elif fields[k]["type"] == "int":
+            fields[k]["args"]["num_type"] = int
+        elif fields[k]["type"] == "year":
+            fields[k]["args"]["num_type"] = int
+            fields[k]["args"]["num_range"] = [1990, 2050]
     types_to_func = {
-        'int': _number,
-        'float': _number,
-        'date': _date,
-        'time': _time,
-        'str': _string,
-        'url': _url,
-        'id': _id,
-        'year': _number,
-        'bool': _bool,
-        'datetime': _datetime,
-        'array': _array,
+        "int": _number,
+        "float": _number,
+        "date": _date,
+        "time": _time,
+        "str": _string,
+        "url": _url,
+        "id": _id,
+        "year": _number,
+        "bool": _bool,
+        "datetime": _datetime,
+        "array": _array,
     }
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
             [
-                types_to_func.get(f['type'], 'str')(**f['args'])
+                types_to_func.get(f["type"], "str")(**f["args"])
                 for f in fields
             ] for _ in range(file_length)
         ],

csv_detective/validate.py CHANGED Viewed

@@ -42,7 +42,8 @@ def validate(
         any(col_name not in list(table.columns) for col_name in previous_analysis["columns"])
         or any(col_name not in list(previous_analysis["columns"].keys()) for col_name in table.columns)
     ):
-        logging.warning("> Columns do not match, proceeding with full analysis")
+        if verbose:
+            logging.warning("> Columns do not match, proceeding with full analysis")
         return False, table, analysis
     for col_name, args in previous_analysis["columns"].items():
         if verbose:
@@ -55,7 +56,8 @@ def validate(
         if skipna:
             col_data = col_data.loc[~col_data.isna()]
         if not col_data.apply(test_func).all():
-            logging.warning("> Test failed, proceeding with full analysis")
+            if verbose:
+                logging.warning("> Test failed, proceeding with full analysis")
             return False, table, analysis
     if verbose:
         logging.info("> All checks successful")

{csv_detective-0.7.5.dev1286.data → csv_detective-0.7.5.dev1307.data}/data/share/csv_detective/CHANGELOG.md RENAMED Viewed

@@ -15,6 +15,7 @@
 - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
 - Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
 - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)
+- Refactor fields tests [#114](https://github.com/datagouv/csv-detective/pull/114)
 ## 0.7.4 (2024-11-15)

{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1307.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv_detective
-Version: 0.7.5.dev1286
+Version: 0.7.5.dev1307
 Summary: Detect CSV column content
 Home-page: https://github.com/etalab/csv_detective
 Author: Etalab

{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1307.dist-info}/RECORD RENAMED Viewed

@@ -4,8 +4,8 @@ csv_detective/explore_csv.py,sha256=ocWlUEtuwZ-6bjDc6gfhC2-6DljMVhvXhHrfICCXGfQ,
 csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
-csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
-csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
+csv_detective/validate.py,sha256=0wSi5GgKPRW3m66413a-9Uti1vBRam5pQxVA9Dc5jQ8,2368
+csv_detective/detect_fields/__init__.py,sha256=qkwT_o_S7qvLEsRssICpoGmCc3h5y2MVy1XI56LFcV0,959
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
@@ -50,7 +50,7 @@ csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=wJAy
 csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
 csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=FPHOfTrfXJs62-NgeOcNGOvwPd7I1fEVp8lTdMNfj3w,433
 csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSqNDZhXZz1TwzdiwdV8ovRYTOacpg,327
-csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
+csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=7_mnO9uC_kI7e2WR8xIer7Kqw8zi-v-JKaAD4zcoGbE,342
 csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
@@ -58,7 +58,9 @@ csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqE
 csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
+csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5WqLiE8qQCZHtoI9eJvl_9M,232
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
+csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
 csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeEZ5Hkf5Wwi3ZKclLER_V0YO3g,154
 csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
@@ -135,7 +137,7 @@ csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_p
 csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
 csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
 csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
-csv_detective/output/example.py,sha256=i8PkdXxidF7qR_9aK8vh12JpZdJQryhBgyrMS8iy5rk,8642
+csv_detective/output/example.py,sha256=26rY7XNXK47e9xJMl-Js8jJwFIuv7V7B7e256VecKuk,8652
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
 csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
 csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
@@ -145,19 +147,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
 csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
 csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
 csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
-csv_detective-0.7.5.dev1286.data/data/share/csv_detective/CHANGELOG.md,sha256=Gqw7W41bXK_JgIYi80vdOPR6JLY5rgABeNsiDStE4XA,7901
-csv_detective-0.7.5.dev1286.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
-csv_detective-0.7.5.dev1286.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
-csv_detective-0.7.5.dev1286.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1307.data/data/share/csv_detective/CHANGELOG.md,sha256=Y8aL18x5EGGvA9AqukEi4tn78se_Lzisa2J32kOSer8,7984
+csv_detective-0.7.5.dev1307.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1307.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1307.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
-tests/test_fields.py,sha256=53kiUQiqGt4_fnyCoxhNLeEsuN1LRDB-7HGT3p_Ed9I,11147
+tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
+tests/test_fields.py,sha256=0hce2XtDHY9dTLCYhrm2s4I41OeKsQbbaKmDZ4XctUw,9824
 tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
 tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
 tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
-csv_detective-0.7.5.dev1286.dist-info/METADATA,sha256=rLptgL-FkLZzfkxPt7_0I-k7EKPKbEHhd3Ei2qt54KI,1386
-csv_detective-0.7.5.dev1286.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
-csv_detective-0.7.5.dev1286.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.7.5.dev1286.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
-csv_detective-0.7.5.dev1286.dist-info/RECORD,,
+csv_detective-0.7.5.dev1307.dist-info/METADATA,sha256=RaSc6oAUAB9KsfbjOi5xRdyM8d127pL_GKYEU0195mA,1386
+csv_detective-0.7.5.dev1307.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
+csv_detective-0.7.5.dev1307.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1307.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1307.dist-info/RECORD,,

{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1307.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (79.0.0)
+Generator: setuptools (80.4.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

tests/test_example.py CHANGED Viewed

@@ -13,24 +13,24 @@ def test_example_creation():
         {
             "name": "nom_modele",
             "type": "str",
-            "args": {'length': 20},
+            "args": {"length": 20},
         },
         {
             "name": "siret",
             "type": "str",
-            "args": {'pattern': '^\\d{14}$'},
+            "args": {"pattern": "^\\d{14}$"},
         },
         {
             "name": "type_producteur",
             "type": "str",
-            "args": {'enum': ['privé', 'public', 'association']},
+            "args": {"enum": ["privé", "public", "association"]},
         },
         {
             "name": "date_creation",
             "type": "date",
             "args": {
-                'date_range': ['1996-02-13', '2000-01-28'],
-                'format': '%Y-%m-%d',
+                "date_range": ["1996-02-13", "2000-01-28"],
+                "format": "%Y-%m-%d",
             },
         },
         {
@@ -44,20 +44,20 @@ def test_example_creation():
         {
             "name": "note",
             "type": "float",
-            "args": {'num_range': [1, 20]}
+            "args": {"num_range": [1, 20]}
         },
     ]
     df = create_example_csv_file(
         fields=fields,
         file_length=5,
-        output_name="",
+        output_name=None,
     )
     assert len(df) == 5
     assert all(UUID(_) for _ in df["id_unique"])
     assert all(len(_) == 20 for _ in df["nom_modele"])
     assert all(re.match("^\\d{14}$", _) for _ in df["siret"])
-    assert all(_ in ['privé', 'public', 'association'] for _ in df["type_producteur"])
-    assert all(_ >= '1996-02-13' and _ <= '2000-01-28' for _ in df["date_creation"])
+    assert all(_ in ["privé", "public", "association"] for _ in df["type_producteur"])
+    assert all(_ >= "1996-02-13" and _ <= "2000-01-28" for _ in df["date_creation"])
     assert all(_.startswith("http") for _ in df["url_produit"])
     assert all(isinstance(_, int) for _ in df["nb_produits"])
     assert all(_ >= 1 and _ <= 20 for _ in df["note"])
@@ -66,6 +66,6 @@ def test_example_creation():
 def test_example_from_tableschema():
     df = create_example_csv_file(
         schema_path="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json",
-        output_name="",
+        output_name=None,
     )
     assert len(df) == 10

tests/test_fields.py CHANGED Viewed

@@ -15,7 +15,9 @@ from csv_detective.detect_fields.FR.geo import (
     departement,
     insee_canton,
     latitude_l93,
+    latitude_wgs_fr_metropole,
     longitude_l93,
+    longitude_wgs_fr_metropole,
     pays,
     region,
 )
@@ -24,26 +26,38 @@ from csv_detective.detect_fields.FR.other import (
     code_rna,
     code_waldec,
     csp_insee,
+    date_fr,
+    insee_ape700,
     sexe,
     siren,
+    siret,
     tel_fr,
+    uai,
 )
-from csv_detective.detect_fields.FR.temp import jour_de_la_semaine
+from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee
 from csv_detective.detect_fields.geo import (
     iso_country_code_alpha2,
     iso_country_code_alpha3,
     iso_country_code_numeric,
+    json_geojson,
+    latitude_wgs,
+    latlon_wgs,
+    longitude_wgs,
 )
 from csv_detective.detect_fields.other import (
+    booleen,
     email,
     json,
+    money,
     mongo_object_id,
+    percent,
+    twitter,
     url,
     uuid,
     int as test_int,
     float as test_float,
 )
-from csv_detective.detect_fields.temp import date, datetime_iso, datetime_rfc822, year
+from csv_detective.detect_fields.temp import date, datetime, datetime_iso, datetime_rfc822, year
 from csv_detective.detection.variables import (
     detect_continuous_variable,
     detect_categorical_variable,
@@ -94,420 +108,261 @@ def test_detect_continuous_variable():
     assert res2.values and res2.values[0] == "cont"
-# csp_insee
-def test_match_csp_insee():
-    val = "employes de la poste"
-    assert csp_insee._is(val)
-def test_do_not_match_csp_insee():
-    val = "super-heros"
-    assert not csp_insee._is(val)
-# code_csp_insee
-def test_match_code_csp_insee():
-    val = "121f"
-    assert code_csp_insee._is(val)
-def test_do_not_match_code_csp_insee():
-    val = "121x"
-    assert not code_csp_insee._is(val)
-# sexe
-def test_match_sexe():
-    val = "homme"
-    assert sexe._is(val)
-def test_do_not_match_sexe():
-    val = "hermaphrodite"
-    assert not sexe._is(val)
-# tel_fr
-def test_match_tel_fr():
-    val = "0134643467"
-    assert tel_fr._is(val)
-def test_do_not_match_tel_fr():
-    val = "3345689715"
-    assert not tel_fr._is(val)
-# email
-def test_match_email():
-    val = "cdo_intern@data.gouv.fr"
-    assert email._is(val)
-def test_do_not_match_email():
-    val = "cdo@@gouv.sfd"
-    assert not email._is(val)
-# uuid
-def test_match_uuid():
-    val = "884762be-51f3-44c3-b811-1e14c5d89262"
-    assert uuid._is(val)
-def test_do_not_match_uuid():
-    val = "0610928327"
-    assert not uuid._is(val)
-# Mongo ObjectId
-def test_match_mongo_object_id():
-    val = "62320e50f981bc2b57bcc044"
-    assert mongo_object_id._is(val)
-def test_do_not_match_mongo_object_id():
-    val = "884762be-51f3-44c3-b811-1e14c5d89262"
-    assert not mongo_object_id._is(val)
-# url
-def test_match_url():
-    val = "www.etalab.data.gouv.fr"
-    assert url._is(val)
-def test_do_not_match_url():
-    val = "c est une phrase"
-    assert not url._is(val)
-# adresse
-def test_match_adresse():
-    val = "rue du martyr"
-    assert adresse._is(val)
-def test_do_not_match_adresse():
-    val = "bonjour les amis"
-    assert not adresse._is(val)
-# code_commune_insee
-def test_match_code_commune_insee():
-    val = "91471"
-    assert code_commune_insee._is(val)
-def test_do_not_match_code_commune_insee():
-    val = "914712"
-    assert not code_commune_insee._is(val)
-# code_postal
-def test_match_code_postal():
-    val = "75020"
-    assert code_postal._is(val)
-def test_do_not_match_code_postal():
-    val = "77777"
-    assert not code_postal._is(val)
-# code_departement
-def test_match_code_departement():
-    vals = ["75", "2A", "2a", "974"]
-    for val in vals:
-        assert code_departement._is(val)
-def test_do_not_match_code_departement():
-    val = "00"
-    assert not code_departement._is(val)
-# code_fantoir
-def test_match_code_fantoir():
-    vals = ["7755A", "B150B", "ZA04C", "ZB03D"]
-    for val in vals:
-        assert code_fantoir._is(val)
-def test_do_not_match_code_fantoir():
-    vals = ["7755", "ZA99A"]
-    for val in vals:
-        assert not code_fantoir._is(val)
-# code_region
-def test_match_code_region():
-    val = "32"
-    assert code_region._is(val)
-def test_do_not_match_code_region():
-    val = "55"
-    assert not code_region._is(val)
-# commune
-def test_match_commune():
-    val = "saint denis"
-    assert commune._is(val)
-def test_do_not_match_commune():
-    val = "new york"
-    assert not commune._is(val)
-# departement
-def test_match_departement():
-    val = "essonne"
-    assert departement._is(val)
-def test_do_not_match_departement():
-    val = "new york"
-    assert not departement._is(val)
-# insee_canton
-def test_match_canton():
-    val = "nantua"
-    assert insee_canton._is(val)
-def test_do_not_match_canton():
-    val = "new york"
-    assert not departement._is(val)
-# latitude_l93
-def test_match_latitude_l93():
-    vals = ["6037008", "7123528.5", "7124528,5"]
-    for val in vals:
-        assert latitude_l93._is(val)
-def test_do_not_match_latitude_93():
-    vals = ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"]
-    for val in vals:
-        assert not latitude_l93._is(val)
-# longitude_l93
-def test_match_longitude_l93():
-    vals = ["0", "-154", "1265783,45", "34723.4"]
-    for val in vals:
-        assert longitude_l93._is(val)
-def test_do_not_match_longitude_93():
-    vals = ["1456669.8", "-776225", "346_3214"]
-    for val in vals:
-        assert not longitude_l93._is(val)
-# pays
-def test_match_pays():
-    val = "france"
-    assert pays._is(val)
-def test_do_not_match_pays():
-    val = "new york"
-    assert not pays._is(val)
-# region
-def test_match_region():
-    val = "bretagne"
-    assert region._is(val)
-def test_do_not_match_region():
-    val = "jambon beurre"
-    assert not region._is(val)
-# iso_country_code
-def test_match_iso_country_code():
-    val = "FR"
-    assert iso_country_code_alpha2._is(val)
-def test_do_not_match_iso_country_code():
-    val = "XX"
-    assert not iso_country_code_alpha2._is(val)
-# iso_country_code alpha-3
-def test_match_iso_country_code_alpha3():
-    val = "FRA"
-    assert iso_country_code_alpha3._is(val)
-def test_do_not_match_iso_country_code_alpha3():
-    val = "ABC"
-    assert not iso_country_code_alpha3._is(val)
-# iso_country_code numerique
-def test_match_iso_country_code_numeric():
-    val = "250"
-    assert iso_country_code_numeric._is(val)
-def test_do_not_match_iso_country_code_numeric():
-    val = "003"
-    assert not iso_country_code_numeric._is(val)
-# jour de la semaine
-def test_match_jour_de_la_semaine():
-    val = "lundi"
-    assert jour_de_la_semaine._is(val)
-def test_do_not_match_jour_de_la_semaine():
-    val = "jour de la biere"
-    assert not jour_de_la_semaine._is(val)
-# year
-def test_match_year():
-    val = "2015"
-    assert year._is(val)
-def test_do_not_match_year():
-    val = "20166"
-    assert not year._is(val)
-# date
-def test_match_date():
-    val = "1960-08-07"
-    assert date._is(val)
-    val = "12/02/2007"
-    assert date._is(val)
-    val = "15 jan 1985"
-    assert date._is(val)
-    val = "15 décembre 1985"
-    assert date._is(val)
-    val = "02 05 2003"
-    assert date._is(val)
-    val = "20030502"
-    assert date._is(val)
-    val = "1993-12/02"
-    assert date._is(val)
-def test_do_not_match_date():
-    val = "1993-1993-1993"
-    assert not date._is(val)
-    val = "39-10-1993"
-    assert not date._is(val)
-    val = "19-15-1993"
-    assert not date._is(val)
-    val = "15 tambour 1985"
-    assert not date._is(val)
-    val = "12152003"
-    assert not date._is(val)
-    val = "20031512"
-    assert not date._is(val)
-    val = "02052003"
-    assert not date._is(val)
-# datetime
-def test_match_datetime():
-    val = "2021-06-22T10:20:10"
-    assert datetime_iso._is(val)
-    val = "2021-06-22T30:20:10"
-    assert not datetime_iso._is(val)
-    val = "Sun, 06 Nov 1994 08:49:37 GMT"
-    assert datetime_rfc822._is(val)
-# siren
-def test_match_siren():
-    val = "552 100 554"
-    assert siren._is(val)
-def test_do_not_match_siren():
-    val = "42"
-    assert not siren._is(val)
-# rna
-def test_match_rna():
-    val = "W751515517"
-    assert code_rna._is(val)
-def test_do_not_match_rna():
-    vals = [
-        "W111111111111111111111111111111111111",
-        "w143788974",
-        "W12",
-        "678W23456",
-        "165789325",
-        "Wa1#89sf&h",
-    ]
-    for val in vals:
-        assert not code_rna._is(val)
-def test_match_waldec():
-    val = "751P00188854"
-    assert code_waldec._is(val)
-def test_do_not_match_waldec():
-    val = "AA751PEE00188854"
-    assert not code_waldec._is(val)
-# json
-def test_match_json():
-    val = '{"pomme": "fruit", "reponse": 42}'
-    assert json._is(val)
-    val = "[1,2,3,4]"
-    assert json._is(val)
-def test_do_not_match_json():
-    val = '{"coordinates": [45.783753, 3.049342], "citycode": "63870"}'
-    assert not json._is(val)
-    val = "666"
-    assert not json._is(val)
-# int
-def test_match_int():
-    for val in ["1", "0", "1764", "-24"]:
-        assert test_int._is(val)
-def test_not_match_int():
-    for val in ["01053", "1.2", "123_456", "+35"]:
-        assert not test_int._is(val)
-# float
-def test_match_float():
-    for val in ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"]:
-        assert test_float._is(val)
+# Sample values for every detector under test. Each key is a detector object
+# exposing `_is`; its value maps the expected result of `_is` to sample inputs:
+#   True  -> inputs for which `_is(value)` must return True
+#   False -> inputs for which `_is(value)` must return False
+fields = {
+    adresse: {
+        True: ["rue du martyr"],
+        False: ["un batiment"],
+    },
+    code_commune_insee: {
+        True: ["91471", "01053"],
+        False: ["914712", "01000"],
+    },
+    code_departement: {
+        True: ["75", "2A", "2b", "974", "01"],
+        False: ["00", "96", "101"],
+    },
+    code_fantoir: {
+        True: ["7755A", "B150B", "ZA04C", "ZB03D"],
+        False: ["7755", "ZA99A"],
+    },
+    code_postal: {
+        True: ["75020", "01000"],
+        False: ["77777", "018339"],
+    },
+    code_region: {
+        True: ["32"],
+        False: ["55"],
+    },
+    commune: {
+        True: ["saint denis"],
+        False: ["new york", "lion"],
+    },
+    departement: {
+        True: ["essonne"],
+        False: ["alabama", "auvergne"],
+    },
+    insee_canton: {
+        True: ["nantua"],
+        False: ["california"],
+    },
+    latitude_l93: {
+        True: ["6037008", "7123528.5", "7124528,5"],
+        False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"],
+    },
+    longitude_l93: {
+        True: ["0", "-154", "1265783,45", "34723.4"],
+        False: ["1456669.8", "-776225", "346_3214"],
+    },
+    latitude_wgs_fr_metropole: {
+        True: ["42.5"],
+        False: ["22.5", "62.5"],
+    },
+    longitude_wgs_fr_metropole: {
+        True: ["-2.5"],
+        False: ["12.8"],
+    },
+    pays: {
+        True: ["france", "italie"],
+        False: ["amerique", "paris"],
+    },
+    region: {
+        True: ["bretagne", "ile-de-france"],
+        False: ["baviere", "overgne"],
+    },
+    code_csp_insee: {
+        True: ["121f"],
+        False: ["121x"],
+    },
+    code_rna: {
+        True: ["W751515517"],
+        False: [
+            "W111111111111111111111111111111111111",
+            "w143788974",
+            "W12",
+            "678W23456",
+            "165789325",
+            "Wa1#89sf&h",
+        ],
+    },
+    code_waldec: {
+        True: ["751P00188854"],
+        False: ["AA751PEE00188854"],
+    },
+    csp_insee: {
+        True: ["employes de la poste"],
+        False: ["super-heros"],
+    },
+    sexe: {
+        True: ["homme"],
+        False: ["hermaphrodite"],
+    },
+    siren: {
+        True: ["552 100 554", "552100554"],
+        False: ["42"],
+    },
+    siret: {
+        True: ["13002526500013", "130 025 265 00013"],
+        False: ["13002526500012"],
+    },
+    uai: {
+        True: ["0422170F"],
+        False: ["04292E"],
+    },
+    date_fr: {
+        True: ["13 fevrier 1996"],
+        False: ["44 march 2025"],
+    },
+    insee_ape700: {
+        True: ["0116Z"],
+        False: ["0116A"]
+    },
+    tel_fr: {
+        True: ["0134643467"],
+        False: ["6625388263", "01288398"],
+    },
+    jour_de_la_semaine: {
+        True: ["lundi"],
+        False: ["jour de la biere"],
+    },
+    mois_de_annee: {
+        True: ["juin", "décembre"],
+        False: ["november"],
+    },
+    iso_country_code_alpha2: {
+        True: ["FR"],
+        False: ["XX", "A", "FRA"],
+    },
+    iso_country_code_alpha3: {
+        True: ["FRA"],
+        False: ["XXX", "FR", "A"],
+    },
+    iso_country_code_numeric: {
+        True: ["250"],
+        False: ["003"],
+    },
+    json_geojson: {
+        True: [
+            '{"coordinates": [45.783753, 3.049342], "type": "63870"}',
+            '{"geometry": {"coordinates": [45.783753, 3.049342]}}',
+        ],
+        False: ['{"pomme": "fruit", "reponse": 42}'],
+    },
+    latitude_wgs: {
+        True: ["43.2", "-22"],
+        False: ["100"],
+    },
+    latlon_wgs: {
+        True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8"],
+        False: ["0.1,192", "-102, 92"],
+    },
+    longitude_wgs: {
+        True: ["120", "-20.2"],
+        False: ["-200"],
+    },
+    booleen: {
+        True: ["oui", "0", "1", "yes", "false", "True"],
+        False: ["nein", "ja", "2", "-0"],
+    },
+    email: {
+        True: ["cdo_intern@data.gouv.fr"],
+        False: ["cdo@@gouv.sfd"],
+    },
+    json: {
+        True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
+        False: ['{"coordinates": [45.783753, 3.049342], "citycode": "63870"}', "{zefib:"],
+    },
+    money: {
+        True: ["120€", "-20.2$"],
+        False: ["200", "100 euros"],
+    },
+    mongo_object_id: {
+        True: ["62320e50f981bc2b57bcc044"],
+        False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
+    },
+    percent: {
+        True: ["120%", "-20.2%"],
+        False: ["200", "100 pourcents"],
+    },
+    twitter: {
+        True: ["@accueil1"],
+        False: ["adresse@mail"],
+    },
+    url: {
+        True: ["www.etalab.data.gouv.fr"],
+        False: ["une phrase avec un @ dedans"],
+    },
+    uuid: {
+        True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
+        False: ["0610928327"],
+    },
+    test_int: {
+        True: ["1", "0", "1764", "-24"],
+        False: ["01053", "1.2", "123_456", "+35"],
+    },
+    test_float: {
+        True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
+        False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
+    },
+    date: {
+        True: [
+            "1960-08-07",
+            "12/02/2007",
+            "15 jan 1985",
+            "15 décembre 1985",
+            "02 05 2003",
+            "20030502",
+            "1993-12/02",
+        ],
+        False: [
+            "1993-1993-1993",
+            "39-10-1993",
+            "19-15-1993",
+            "15 tambour 1985",
+            "12152003",
+            "20031512",
+            "02052003",
+        ],
+    },
+    datetime: {
+        True: ["2021-06-22T10:20:10"],
+        False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
+    },
+    datetime_iso: {
+        True: ["2021-06-22T10:20:10"],
+        False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
+    },
+    datetime_rfc822: {
+        True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
+        False: ["2021-06-22T10:20:10"],
+    },
+    year: {
+        True: ["2015"],
+        False: ["20166"],
+    },
+}
+# Possible extension: a helper here that adds the True values of (almost)
+# every field to the False values of all the other fields.
+def test_all_fields_have_tests():
+    """Check that every item returned by `return_all_tests` has cases in `fields`."""
+    # NOTE(review): presumably this lists every detector of "detect_fields" --
+    # confirm against the definition of `return_all_tests`
+    all_tests = return_all_tests("ALL", "detect_fields")
+    for test in all_tests:
+        # `.get` returns None for a missing key and an empty mapping is falsy,
+        # so both a missing and an empty entry fail the assertion
+        assert fields.get(test)
-def test_not_match_float():
-    for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
-        assert not test_float._is(val)
+@pytest.mark.parametrize(
+    "args",
+    # one (field, value, valid) case per sample value listed in `fields`,
+    # covering the True list then the False list of each field
+    (
+        (field, value, valid)
+        for field in fields
+        for valid in [True, False]
+        for value in fields[field][valid]
+    ),
+)
+def test_fields_with_values(args):
+    """Check that `field._is(value)` returns the expected boolean `valid`."""
+    field, value, valid = args
+    # identity comparison: `_is` must return the bool object itself,
+    # not merely a truthy or falsy value
+    assert field._is(value) is valid
 @pytest.mark.parametrize(

{csv_detective-0.7.5.dev1286.data → csv_detective-0.7.5.dev1307.data}/data/share/csv_detective/LICENSE.AGPL.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1286.data → csv_detective-0.7.5.dev1307.data}/data/share/csv_detective/README.md RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1307.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1307.dist-info}/licenses/LICENSE.AGPL.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1307.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.7.5.dev1286__py3-none-any.whl → 0.7.5.dev1307__py3-none-any.whl

csv-detective 0.7.5.dev1286__py3-none-any.whl → 0.7.5.dev1307__py3-none-any.whl