csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +0 -2
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +4 -4
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +6 -14
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +3 -1
- tests/test_validation.py +9 -6
- venv/bin/activate_this.py +38 -0
- venv/bin/jp.py +54 -0
- venv/bin/runxlrd.py +410 -0
- csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
- csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
- csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
csv_detective/output/__init__.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Optional, Union
 import pandas as pd

 from csv_detective.utils import is_url
+
 from .dataframe import cast_df
 from .profile import create_profile
 from .schema import generate_table_schema
@@ -24,7 +25,6 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -40,7 +40,7 @@ def generate_output(
     else:
         output_path = os.path.splitext(file_path)[0]
         if is_url(output_path):
-            output_path = output_path.split(
+            output_path = output_path.split("/")[-1]
         if analysis.get("sheet_name"):
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
@@ -48,11 +48,7 @@ def generate_output(
         json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

     if output_schema:
-        analysis["schema"] = generate_table_schema(
-            analysis,
-            save_file=False,
-            verbose=verbose
-        )
+        analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)

     if output_df:
         return analysis, cast_df(
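
Note (illustration, not part of the diff): the lines above derive the JSON report path from the input file path. A minimal standalone sketch of that naming logic, with is_url reproduced from csv_detective.utils as it appears later in this diff and the sheet-name handling simplified:

    import os

    def is_url(file_path: str) -> bool:
        # same heuristic as csv_detective.utils.is_url in this diff
        return file_path.startswith("http")

    def build_output_path(file_path: str, sheet_name=None) -> str:
        # strip the extension, keep only the basename for remote files,
        # then suffix with the sheet name and ".json"
        output_path = os.path.splitext(file_path)[0]
        if is_url(output_path):
            output_path = output_path.split("/")[-1]
        if sheet_name is not None:
            output_path += "_sheet-" + str(sheet_name)
        return output_path + ".json"

    print(build_output_path("https://example.org/data/file.xlsx", sheet_name="2024"))
    # -> file_sheet-2024.json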

csv_detective/output/dataframe.py
CHANGED

@@ -1,7 +1,7 @@
-from datetime import date, datetime
 import json
-from
+from datetime import date, datetime
 from time import time
+from typing import Optional, Union

 import pandas as pd

@@ -30,12 +30,16 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
     raise ValueError(f"Unknown type `{_type}`")


-def cast_df(
+def cast_df(
+    df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+) -> pd.DataFrame:
     if verbose:
         start = time()
     output_df = pd.DataFrame()
     for col_name, detection in columns.items():
-        if detection["python_type"] == "string" or (
+        if detection["python_type"] == "string" or (
+            detection["python_type"] == "json" and not cast_json
+        ):
             # no change if detected type is string
             output_df[col_name] = df[col_name].copy()
         elif detection["python_type"] == "int":
@@ -49,7 +53,7 @@ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bo
         del df[col_name]
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
     return output_df
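
Note (illustration, not part of the diff): the reflowed cast_df signature above takes the detected column types as a dict. A minimal sketch of a call, where the shape of the columns payload is assumed from the python_type lookups visible in this hunk:

    import pandas as pd
    # from csv_detective.output.dataframe import cast_df

    df = pd.DataFrame({"code": ["01", "02"], "population": ["1234", "5678"]})

    # hypothetical detection output: only the "python_type" key is read in this hunk,
    # the real per-column dict may carry more fields (format, score, ...)
    columns = {
        "code": {"python_type": "string"},
        "population": {"python_type": "int"},
    }

    # typed = cast_df(df, columns, cast_json=True, verbose=False)
    # "code" would be copied as-is, "population" cast to an integer dtype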

csv_detective/output/example.py
CHANGED

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import random
 import string
-from typing import Union, Optional, Any, Type
 import uuid
+from datetime import datetime
+from typing import Any, Optional, Type, Union

-from faker import Faker
 import pandas as pd
 import requests
 import rstr
+from faker import Faker

 fake = Faker()

@@ -135,7 +135,7 @@ def create_example_csv_file(
         return random.choice(enum)
     if num_range is None:
         num_range = [0, 1000]
-    if num_type
+    if num_type is int:
         return random.randint(num_range[0], num_range[1])
     else:
         return round(random.uniform(num_range[0], num_range[1]), 1)
@@ -179,7 +179,7 @@ def create_example_csv_file(
         "yearmonth": "date",
         "time": "time",
         "datetime": "datetime",
-        "array": "array"
+        "array": "array",
     }

     if schema_path:
@@ -188,7 +188,7 @@ def create_example_csv_file(
     else:
         with open(schema_path, encoding=encoding) as jsonfile:
             schema = json.load(jsonfile)
-    if
+    if "fields" not in schema.keys():
         raise ValueError("The schema must have a 'fields' key.")
     else:
         fields = [
@@ -198,12 +198,14 @@ def create_example_csv_file(
                 # when frformat is supported in TableSchema, we can build args for French standards
                 # linked to https://github.com/datagouv/fr-format/issues/26
                 "args": (
-                    build_args_from_constraints(f["constraints"])
+                    build_args_from_constraints(f["constraints"])
+                    if "constraints" in f.keys()
                     else build_args_from_constraints(f["arrayItem"]["constraints"])
                     if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                     else {}
-                )
-            }
+                ),
+            }
+            for f in schema["fields"]
         ]

         for k in range(len(fields)):
@@ -234,10 +236,8 @@ def create_example_csv_file(
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
-            [
-
-            for f in fields
-            ] for _ in range(file_length)
+            [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+            for _ in range(file_length)
         ],
         columns=[f["name"] for f in fields],
     )
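
Note (not part of the diff): the "if num_type is int:" fix above relies on class identity, i.e. num_type is expected to hold the type object itself. A small sketch mirroring that branch:

    import random

    def random_number(num_type: type, num_range=None):
        # identity check against the int class, as in the hunk above
        if num_range is None:
            num_range = [0, 1000]
        if num_type is int:
            return random.randint(num_range[0], num_range[1])
        return round(random.uniform(num_range[0], num_range[1]), 1)

    print(random_number(int))    # e.g. 417
    print(random_number(float))  # e.g. 233.7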

csv_detective/output/profile.py
CHANGED

@@ -1,5 +1,5 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from time import time

 import pandas as pd
@@ -29,15 +29,12 @@ def create_profile(
     safe_table = table.copy()
     if not limited_output:
         dict_cols_fields = {
-            k: v[0] if v else {
+            k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
             for k, v in dict_cols_fields.items()
         }
-    dtypes = {
-        k: map_python_types.get(v["python_type"], str)
-        for k, v in dict_cols_fields.items()
-    }
+    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
     for c in safe_table.columns:
-        if dtypes[c]
+        if dtypes[c] is float:
             safe_table[c] = safe_table[c].apply(
                 lambda s: float_casting(s) if isinstance(s, str) else s
             )
@@ -48,18 +45,26 @@ def create_profile(
                 int,
             ]:
                 profile[c].update(
-                    min=prevent_nan(
-
-
-
-
-
-
-
-
-
-
-
+                    min=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].min()
+                        )
+                    ),
+                    max=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].max()
+                        )
+                    ),
+                    mean=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].mean()
+                        )
+                    ),
+                    std=prevent_nan(
+                        map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                            safe_table[c].std()
+                        )
+                    ),
                 )
             tops_bruts = (
                 safe_table[safe_table[c].notna()][c]
@@ -70,10 +75,12 @@
             )
             tops = []
             for tb in tops_bruts:
-                tops.append(
-
-
-
+                tops.append(
+                    {
+                        "count": tb["count"],
+                        "value": tb[c],
+                    }
+                )
             profile[c].update(
                 tops=tops,
                 nb_distinct=safe_table[c].nunique(),
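
Note (illustrative sketch, not part of the diff): the reflowed block above computes per-column numeric stats and guards them with prevent_nan. Roughly equivalent standalone logic; the behaviour of prevent_nan (mapping NaN to None) is an assumption, not taken from the package:

    import math
    import pandas as pd

    def prevent_nan(value):
        # assumed behaviour: replace NaN by None so the profile serialises cleanly
        return None if isinstance(value, float) and math.isnan(value) else value

    col = pd.Series([3.0, 7.5, float("nan"), 1.2])
    profile = {
        "min": prevent_nan(float(col.min())),
        "max": prevent_nan(float(col.max())),
        "mean": prevent_nan(float(col.mean())),
        "std": prevent_nan(float(col.std())),
    }
    print(profile)  # {'min': 1.2, 'max': 7.5, 'mean': 3.9, 'std': ~3.2}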

csv_detective/output/schema.py
CHANGED

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import logging
 import os
 import tempfile
+from datetime import datetime
 from time import time
 from typing import Optional

 from botocore.exceptions import ClientError

-from csv_detective.s3_utils import
+from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time


@@ -26,13 +26,11 @@ def get_description(format: str) -> str:
         "insee_canton": "Le nom du canton",
         "latitude_l93": "La latitude au format Lambert 93",
         "latitude_wgs_fr_metropole": (
-            "La latitude au format WGS. Ne concerne que des latitudes "
-            "de la métropole française"
+            "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
         ),
         "longitude_l93": "La longitude au format Lambert 93",
         "longitude_wgs_fr_metropole": (
-            "La longitude au format WGS. Ne concerne que des longitudes "
-            "de la métropole française"
+            "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
         ),
         "pays": "Le nom du pays",
         "region": "Le nom de la région",
@@ -86,13 +84,13 @@ def get_pattern(format: str) -> str:
         ),
         "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
         "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
-        "twitter": r
-        "mongo_object_id": r
-        "uuid": r
+        "twitter": r"^@[A-Za-z0-9_]+$",
+        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
+        "uuid": r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$",
         "url": (
-            r
-            r
-        )
+            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
+            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
+        ),
     }
     if format in format_to_pattern:
         return {"pattern": format_to_pattern[format]}
@@ -210,7 +208,7 @@ def generate_table_schema(
     key: Optional[str] = None,
     minio_user: Optional[str] = None,
     minio_pwd: Optional[str] = None,
-    verbose: bool = False
+    verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report

@@ -236,7 +234,7 @@ generate_table_schema(
             "example": get_example(field_report["format"]),
             "type": get_validata_type(field_report["format"]),
             "formatFR": field_report["format"],
-            "constraints": get_constraints(field_report["format"])
+            "constraints": get_constraints(field_report["format"]),
         }
         for header, field_report in analysis_report["columns"].items()
     ]
@@ -255,12 +253,9 @@
         "sources": [
             {
                 "title": "Spécification Tableschema",
-                "path": "https://specs.frictionlessdata.io/table-schema"
+                "path": "https://specs.frictionlessdata.io/table-schema",
             },
-            {
-                "title": "schema.data.gouv.fr",
-                "path": "https://schema.data.gouv.fr"
-            }
+            {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
         ],
         "created": datetime.today().strftime("%Y-%m-%d"),
         "lastModified": datetime.today().strftime("%Y-%m-%d"),
@@ -278,7 +273,9 @@
     }

     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Created schema in {round(time() - start, 3)}s", time() - start
+        )

     if not save_file:
         return schema
@@ -301,9 +298,9 @@
     if "Contents" in tableschema_objects:
         tableschema_keys = [
             tableschema["Key"]
-            for tableschema in client.list_objects(
-
-
+            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
+                "Contents"
+            ]
         ]
         tableschema_versions = [
             os.path.splitext(tableschema_key)[0].split("_")[-1]

csv_detective/output/utils.py
CHANGED

@@ -19,14 +19,17 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
         # no need to specify int and float everywhere, they are deprioritized anyway
         ("int", ("float",)),
         # bool over everything
-        (
-            "
-
-
-
-
-
-
+        (
+            "booleen",
+            (
+                "latitude_l93",
+                "latitude_wgs",
+                "latitude_wgs_fr_metropole",
+                "longitude_l93",
+                "longitude_wgs",
+                "longitude_wgs_fr_metropole",
+            ),
+        ),
         ("geojson", ("json",)),
         # latlon over lonlat if no longitude allows to discriminate
         ("latlon_wgs", ("json", "lonlat_wgs")),
@@ -49,13 +52,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
     for prio_format, secondary_formats in priorities:
         if prio_format in detected_formats:
             for secondary in secondary_formats:
-                if (
-
-
-
-                    >= return_dict_cols[column_name][secondary]
-                    or return_dict_cols[column_name][prio_format] >= 1
-                )
+                if secondary in detected_formats and (
+                    return_dict_cols[column_name][prio_format]
+                    >= return_dict_cols[column_name][secondary]
+                    or return_dict_cols[column_name][prio_format] >= 1
                 ):
                     formats_to_remove.add(secondary)

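
Note (illustrative sketch, not part of the diff): the new guard above only compares scores when the secondary format was actually detected for the column. With hypothetical scores and a trimmed-down priority table shaped like the hunk above:

    priorities = [("int", ("float",)), ("geojson", ("json",))]
    scores = {"int": 1.0, "float": 0.8, "json": 0.0}           # hypothetical per-column scores
    detected_formats = {f for f, s in scores.items() if s > 0}  # assumption about how detection is derived

    formats_to_remove = set()
    for prio_format, secondary_formats in priorities:
        if prio_format in detected_formats:
            for secondary in secondary_formats:
                # the added `secondary in detected_formats` check skips formats
                # that were never detected for this column
                if secondary in detected_formats and (
                    scores[prio_format] >= scores[secondary] or scores[prio_format] >= 1
                ):
                    formats_to_remove.add(secondary)

    print(formats_to_remove)  # {'float'}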

csv_detective/parsing/columns.py
CHANGED

@@ -28,6 +28,7 @@ def test_col_val(
     # TODO : change for a cleaner method and only test columns in modules labels
     def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
         return serie.sample(n=_range).apply(test_func)
+
     try:
         if skipna:
             serie = serie[serie.notnull()]
@@ -60,11 +61,13 @@ def test_col_val(
     if verbose and time() - start > 3:
         display_logs_depending_process_time(
             f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
-            time() - start
+            time() - start,
         )


-def test_col_label(
+def test_col_label(
+    label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False
+):
     """Tests label (from header) using test_func.
     - proportion : indicates the minimum score to pass the test for the serie
     to be detected as a certain format
@@ -76,7 +79,13 @@ def test_col_label(label: str, test_func: Callable, proportion: float = 1, limit
     return result if result >= proportion else 0


-def test_col(
+def test_col(
+    table: pd.DataFrame,
+    all_tests: list,
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+):
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
@@ -106,11 +115,13 @@ def test_col(table: pd.DataFrame, all_tests: list, limited_output: bool, skipna:
         )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
+                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                time() - start_type,
             )
     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Done testing columns in {round(time() - start, 3)}s", time() - start
+        )
     return return_table


@@ -128,16 +139,16 @@ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbo
         if verbose:
             start_type = time()
         return_table.loc[key] = [
-            test_col_label(
-                col_name, value["func"], value["prop"], limited_output=limited_output
-            )
+            test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
            for col_name in table.columns
         ]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                time() - start_type,
             )
     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Done testing labels in {round(time() - start, 3)}s", time() - start
+        )
     return return_table

csv_detective/parsing/csv.py
CHANGED

@@ -49,7 +49,7 @@ def parse_csv(
         raise ValueError("Could not load file")
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates

csv_detective/parsing/excel.py
CHANGED

@@ -28,14 +28,13 @@ def parse_excel(
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
-    """"Excel-like parsing is really slow, could be a good improvement for future development"""
+    """ "Excel-like parsing is really slow, could be a good improvement for future development"""
     if verbose:
         start = time()
     no_sheet_specified = sheet_name is None

-    if (
-
-        any([file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
+    if engine in ["openpyxl", "xlrd"] or any(
+        [file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT]
     ):
         remote_content = None
         if is_url(file_path):
@@ -50,7 +49,7 @@
         if sheet_name is None:
             if verbose:
                 display_logs_depending_process_time(
-                    f
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
                     time() - start,
                 )
             try:
@@ -58,8 +57,8 @@
                 # openpyxl doesn't want to open files that don't have a valid extension
                 # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
                 # if the file is remote, we have a remote content anyway so it's fine
-                if not remote_content and
-                    with open(file_path,
+                if not remote_content and "." not in file_path.split("/")[-1]:
+                    with open(file_path, "rb") as f:
                         remote_content = BytesIO(f.read())
                 # faster than loading all sheets
                 wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
@@ -82,7 +81,7 @@
             # sometimes a xls file is recognized as ods
             if verbose:
                 display_logs_depending_process_time(
-
+                    "Could not read file with classic xls reader, trying with ODS",
                     time() - start,
                 )
             engine = "odf"
@@ -95,7 +94,7 @@
         if sheet_name is None:
             if verbose:
                 display_logs_depending_process_time(
-                    f
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
                     time() - start,
                 )
         tables = pd.read_excel(
@@ -132,7 +131,7 @@
         table = table.sample(num_rows, random_state=random_state)
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
@@ -163,7 +162,7 @@
     table = table.sample(num_rows, random_state=random_state)
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
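
Note (assumption-heavy sketch, not part of the diff): the hunk above opens the workbook with read_only=True so the largest sheet can be picked without loading every sheet. One way to do that with openpyxl; the selection criterion below is an assumption for illustration, not taken from the package:

    import openpyxl

    def largest_sheet_name(path: str) -> str:
        # read-only mode keeps memory usage low and is faster than loading all sheets
        wb = openpyxl.load_workbook(path, read_only=True)
        try:
            return max(wb.worksheets, key=lambda ws: ws.max_row or 0).title
        finally:
            wb.close()

    # largest_sheet_name("report.xlsx")  # e.g. "data_2024"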

csv_detective/parsing/load.py
CHANGED

@@ -14,6 +14,7 @@ from csv_detective.detection.engine import (
 from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.utils import is_url
+
 from .compression import unzip
 from .csv import parse_csv
 from .excel import (
@@ -30,9 +31,9 @@ def load_file(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> tuple[pd.DataFrame, dict]:
-    file_name = file_path.split(
+    file_name = file_path.split("/")[-1]
     engine = None
-    if
+    if "." not in file_name or not file_name.endswith("csv"):
         # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)

@@ -88,10 +89,12 @@ def load_file(
         "heading_columns": heading_columns,
         "trailing_columns": trailing_columns,
     }
-    analysis.update(
-
-
-
-
-
+    analysis.update(
+        {
+            "header_row_idx": header_row_idx,
+            "header": header,
+            "total_lines": total_lines,
+            "nb_duplicates": nb_duplicates,
+        }
+    )
     return table, analysis

csv_detective/parsing/text.py
CHANGED

@@ -2,9 +2,7 @@ from re import finditer


 def camel_case_split(identifier: str):
-    matches = finditer(
-        ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier
-    )
+    matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
     return " ".join([m.group(0) for m in matches])


@@ -46,15 +44,12 @@ def header_score(header: str, words_combinations_list: list[str]) -> float:
     processed_header = _process_text(header)

     header_matches_words_combination = float(
-        any(
-            words_combination == processed_header for words_combination in words_combinations_list
-        )
+        any(words_combination == processed_header for words_combination in words_combinations_list)
     )
     words_combination_in_header = 0.5 * (
         any(
-            is_word_in_string(
-
-            ) for words_combination in words_combinations_list
+            is_word_in_string(words_combination, processed_header)
+            for words_combination in words_combinations_list
         )
     )

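
Note (not part of the diff): the regex collapsed onto one line above splits camelCase identifiers on case boundaries. A usage example, reproducing the function as it appears in the hunk:

    from re import finditer

    def camel_case_split(identifier: str):
        matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
        return " ".join([m.group(0) for m in matches])

    print(camel_case_split("codePostal"))    # "code Postal"
    print(camel_case_split("INSEECommune"))  # "INSEE Commune"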

csv_detective/s3_utils.py
CHANGED

@@ -1,6 +1,6 @@
-import boto3
 import logging

+import boto3
 from botocore.client import Config
 from botocore.exceptions import ClientError

@@ -27,9 +27,7 @@ def download_from_minio(
     s3 = get_s3_client(netloc, minio_user, minio_pwd)
     try:
         s3.download_file(bucket, key, filepath)
-        logging.info(
-            f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}"
-        )
+        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
     except ClientError as e:
         logging.error(e)

@@ -41,8 +39,6 @@ def upload_to_minio(
     s3 = get_s3_client(netloc, minio_user, minio_pwd)
     try:
         s3.upload_file(filepath, bucket, key)
-        logging.info(
-            f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}"
-        )
+        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
     except ClientError as e:
         logging.error(e)

csv_detective/utils.py
CHANGED

@@ -4,7 +4,9 @@ from typing import Optional
 import pandas as pd

 logging.basicConfig(level=logging.INFO)
-logging.addLevelName(
+logging.addLevelName(
+    logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL)
+)
 logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))

 THRESHOLD_WARN = 1
@@ -26,7 +28,7 @@ def display_logs_depending_process_time(prompt: str, duration: float) -> None:
 def is_url(file_path: str) -> bool:
     # could be more sophisticated if needed
     # using the URL detection test was considered but too broad (schema required to use requests)
-    return file_path.startswith(
+    return file_path.startswith("http")


 def prevent_nan(value: float) -> Optional[float]:
|