csv-detective 0.7.5.dev1009__py3-none-any.whl → 0.7.5.dev1052__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -0
- csv_detective/create_example.py +247 -0
- {csv_detective-0.7.5.dev1009.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/CHANGELOG.md +1 -1
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/METADATA +3 -1
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/RECORD +12 -10
- tests/test_example.py +71 -0
- {csv_detective-0.7.5.dev1009.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1009.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import uuid
|
|
3
|
+
import string
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from typing import List, Union, Optional, Any, Type
|
|
7
|
+
import json
|
|
8
|
+
import requests
|
|
9
|
+
import rstr
|
|
10
|
+
from faker import Faker
|
|
11
|
+
|
|
12
|
+
fake = Faker()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_example_csv_file(
    fields: Optional[List[dict]] = None,
    schema_path: Optional[str] = None,
    file_length: int = 10,
    output_name: str = 'example_file.csv',
    output_sep: str = ';',
    encoding: str = 'utf-8',
    ignore_required: bool = False,
) -> pd.DataFrame:
    '''
    Create an example file based on a list of dicts like follows:
    fields = [
        {
            "name": "column_name",
            "type": "column_type",
            "args": {dict_of_args} # optional
        },
        ...
    ]
    Or from a TableSchema (local file path or http(s) URL) given in `schema_path`.

    Args:
        fields: list of column descriptions (see above); superseded by `schema_path`.
        schema_path: path or URL of a TableSchema JSON file with a "fields" key.
        file_length: number of rows to generate.
        output_name: CSV destination; a falsy value skips writing to disk.
        output_sep: separator used when writing the CSV.
        encoding: encoding used to read a local schema file.
        ignore_required: if True, every cell is filled; otherwise non-required
            columns are randomly left empty.

    Returns:
        The generated rows as a pandas DataFrame.

    Raises:
        ValueError: if neither `fields` nor `schema_path` is given, if the
            schema lacks a "fields" key, or if a *_range argument does not
            have exactly two elements.
    '''
    # need to make a CLI command

    if not (fields or schema_path):
        raise ValueError("At least fields or schema_path must be specified.")

    def potential_skip(required: bool) -> bool:
        # Decide whether an optional cell should be left empty.
        if ignore_required:
            return False
        if not required:
            # for now 30% chance to have an optional value, this could go as an argument
            return random.randint(1, 100) <= 30
        # required fields are never skipped (was an implicit `None` fall-through)
        return False

    def _string(
        length: int = 10,
        required: bool = True,
        pattern: Optional[str] = None,
        enum: Optional[list] = None,
    ) -> str:
        # Random string from a regex pattern, an enum, or plain lowercase letters.
        if potential_skip(required):
            return ''
        if pattern is not None:
            return rstr.xeger(pattern)
        elif enum is not None:
            return random.choice(enum)
        else:
            letters = string.ascii_lowercase
            return ''.join(random.choice(letters) for i in range(length))

    def _id(
        required: bool = True,
    ) -> str:
        # Random UUID4 rendered as a string.
        if potential_skip(required):
            return ''
        return str(uuid.uuid4())

    def _date(
        date_range: Union[None, List[str]] = None,
        format: str = '%Y-%m-%d',
        required: bool = True,
    ) -> str:
        # the bounds specified in date_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y']])
        if potential_skip(required):
            return ''
        if date_range is None:
            return fake.date(format)
        else:
            if len(date_range) != 2:
                raise ValueError('"date_range" must have exactly two elements.')
            return fake.date_between_dates(
                datetime.strptime(date_range[0], format),
                datetime.strptime(date_range[1], format),
            ).strftime(format)

    def _time(
        format: str = '%H:%M:%S',
        required: bool = True,
    ) -> str:
        assert all([k in format for k in ['%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        # maybe add a time_range argument?
        return fake.time(format)

    def _datetime(
        datetime_range: Optional[List[str]] = None,
        format: str = '%Y-%m-%d %H-%M-%S',
        required: bool = True,
    ) -> str:
        # the bounds specified in datetime_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y', '%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        if datetime_range is None:
            return fake.date_time().strftime(format)
        else:
            if len(datetime_range) != 2:
                # fixed: the message previously referred to "date_range"
                raise ValueError('"datetime_range" must have exactly two elements.')
            return fake.date_time_between(
                datetime.strptime(datetime_range[0], format),
                datetime.strptime(datetime_range[1], format),
            ).strftime(format)

    def _url(required: bool = True) -> str:
        # Random but syntactically plausible http URL.
        if potential_skip(required):
            return ''
        return f'http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}'

    def _number(
        num_type: Type[Union[int, float]] = int,
        num_range: Optional[List[float]] = None,
        enum: Optional[list] = None,
        required: bool = True,
    ) -> Union[int, float]:
        assert num_range is None or len(num_range) == 2
        if potential_skip(required):
            # empty string marks a skipped cell, same convention as other generators
            return ''
        if enum:
            return random.choice(enum)
        if num_range is None:
            num_range = [0, 1000]
        if num_type == int:
            return random.randint(num_range[0], num_range[1])
        else:
            # floats are rounded to one decimal place
            return round(random.uniform(num_range[0], num_range[1]), 1)

    def _bool(required: bool = True) -> bool:
        if potential_skip(required):
            return ''
        return random.randint(0, 1) == 0

    def _array(enum: List[Any], required: bool = True) -> str:
        # Random non-empty subset of enum, serialized as "[a,b,...]".
        if potential_skip(required):
            return ''
        return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"

    def build_args_from_constraints(constraints: dict) -> dict:
        # Translate TableSchema constraints into generator keyword arguments.
        args = {}
        args['required'] = constraints.get('required', False)
        for _ in ['pattern', 'enum', 'format']:
            if _ in constraints:
                args[_] = constraints[_]
        if 'minimum' in constraints and 'maximum' in constraints:
            args['num_range'] = [constraints['minimum'], constraints['maximum']]
        # maybe there are better values than these?
        elif 'minimum' in constraints:
            args['num_range'] = [constraints['minimum'], 10 + constraints['minimum']]
        elif 'maximum' in constraints:
            args['num_range'] = [constraints['maximum'] - 10, constraints['maximum']]
        if 'minLength' in constraints:
            args['length'] = constraints['minLength']
        # NOTE: when both are present, maxLength takes precedence over minLength
        if 'maxLength' in constraints:
            args['length'] = constraints['maxLength']
        return args

    # map TableSchema field types to the internal type names used by types_to_func
    schema_types_to_python = {
        'number': 'float',
        'integer': 'int',
        'string': 'str',
        'year': 'year',
        'boolean': 'bool',
        'date': 'date',
        'yearmonth': 'date',
        'time': 'time',
        'datetime': 'datetime',
        'array': 'array'
    }

    if schema_path:
        if schema_path.startswith('http'):
            schema = requests.get(schema_path).json()
        else:
            with open(schema_path, encoding=encoding) as jsonfile:
                schema = json.load(jsonfile)
        if not ('fields' in schema.keys()):
            raise ValueError('The schema must have a "fields" key.')
        else:
            fields = [
                {
                    'name': f['name'],
                    'type': schema_types_to_python.get(f['type'], 'str'),
                    # when frformat is supported in TableSchema, we can build args for French standards
                    # linked to https://github.com/datagouv/fr-format/issues/26
                    'args': (
                        build_args_from_constraints(f['constraints']) if 'constraints' in f.keys()
                        else build_args_from_constraints(f['arrayItem']['constraints'])
                        if 'arrayItem' in f.keys() and 'constraints' in f['arrayItem'].keys()
                        else {}
                    )
                } for f in schema['fields']
            ]

    # normalize every field: guarantee an 'args' dict and numeric type hints
    for k in range(len(fields)):
        if 'args' not in fields[k]:
            fields[k]['args'] = {}
        if fields[k]['type'] == 'float':
            fields[k]['args']['num_type'] = float
        elif fields[k]['type'] == 'int':
            fields[k]['args']['num_type'] = int
        elif fields[k]['type'] == 'year':
            fields[k]['args']['num_type'] = int
            fields[k]['args']['num_range'] = [1990, 2050]

    types_to_func = {
        'int': _number,
        'float': _number,
        'date': _date,
        'time': _time,
        'str': _string,
        'url': _url,
        'id': _id,
        'year': _number,
        'bool': _bool,
        'datetime': _datetime,
        'array': _array,
    }

    # would it be better to create by column or by row (as for now)?
    output = pd.DataFrame(
        [
            [
                # fall back to plain string generation for unknown types
                # (fixed: the default used to be the non-callable literal 'str')
                types_to_func.get(f['type'], _string)(**f['args'])
                for f in fields
            ] for _ in range(file_length)
        ],
        columns=[f["name"] for f in fields],
    )

    if output_name:
        output.to_csv(output_name, sep=output_sep, index=False)

    return output
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: csv_detective
|
|
3
|
-
Version: 0.7.5.
|
|
3
|
+
Version: 0.7.5.dev1052
|
|
4
4
|
Summary: Detect CSV column content
|
|
5
5
|
Home-page: https://github.com/etalab/csv_detective
|
|
6
6
|
Author: Etalab
|
|
@@ -27,4 +27,6 @@ Requires-Dist: requests==2.32.3
|
|
|
27
27
|
Requires-Dist: responses==0.25.0
|
|
28
28
|
Requires-Dist: python-magic==0.4.27
|
|
29
29
|
Requires-Dist: frformat==0.4.0
|
|
30
|
+
Requires-Dist: faker==33.0.0
|
|
31
|
+
Requires-Dist: rstr==3.2.2
|
|
30
32
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
csv_detective/__init__.py,sha256=
|
|
1
|
+
csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
|
|
2
2
|
csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
|
|
3
|
+
csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
|
|
3
4
|
csv_detective/detection.py,sha256=AuXlPOZfzqznZY2ybAAgaXIq6qVITYd3MXf2CoigI3I,22097
|
|
4
5
|
csv_detective/explore_csv.py,sha256=X5yZS3WCUsafUMcs5tOnDTeMGzMnfr0iB9vEDx7xiqg,16977
|
|
5
6
|
csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
|
|
@@ -125,17 +126,18 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKU
|
|
|
125
126
|
csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
|
|
126
127
|
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
|
|
127
128
|
csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
|
|
128
|
-
csv_detective-0.7.5.
|
|
129
|
-
csv_detective-0.7.5.
|
|
130
|
-
csv_detective-0.7.5.
|
|
129
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/CHANGELOG.md,sha256=oDqKO3qTo-cUSJB4fMbsyQY2O4pEQhOwWeHsZwaGkxM,6725
|
|
130
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
131
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
131
132
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
133
|
+
tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
|
|
132
134
|
tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
|
|
133
135
|
tests/test_file.py,sha256=1fEOu3bArGBaarRKAoTXAF3cSIGJfFN3UIwOW6esWRs,6399
|
|
134
136
|
tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
|
|
135
137
|
tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
|
|
136
|
-
csv_detective-0.7.5.
|
|
137
|
-
csv_detective-0.7.5.
|
|
138
|
-
csv_detective-0.7.5.
|
|
139
|
-
csv_detective-0.7.5.
|
|
140
|
-
csv_detective-0.7.5.
|
|
141
|
-
csv_detective-0.7.5.
|
|
138
|
+
csv_detective-0.7.5.dev1052.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
139
|
+
csv_detective-0.7.5.dev1052.dist-info/METADATA,sha256=mgg54BtjqPn-L_G2a4JU0SyorK8uYzUyp64cxOAIe6A,1146
|
|
140
|
+
csv_detective-0.7.5.dev1052.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
|
141
|
+
csv_detective-0.7.5.dev1052.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
142
|
+
csv_detective-0.7.5.dev1052.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
143
|
+
csv_detective-0.7.5.dev1052.dist-info/RECORD,,
|
tests/test_example.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
|
|
2
|
+
import re
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
from csv_detective import create_example_csv_file
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_example_creation():
    # Build an example file from explicit field descriptions and check
    # that every generated column respects its constraints.
    producer_types = ['privé', 'public', 'association']
    date_lo, date_hi = '1996-02-13', '2000-01-28'
    fields = [
        {"name": "id_unique", "type": "id"},
        {"name": "nom_modele", "type": "str", "args": {'length': 20}},
        {"name": "siret", "type": "str", "args": {'pattern': '^\\d{14}$'}},
        {"name": "type_producteur", "type": "str", "args": {'enum': producer_types}},
        {
            "name": "date_creation",
            "type": "date",
            "args": {
                'date_range': [date_lo, date_hi],
                'format': '%Y-%m-%d',
            },
        },
        {"name": "url_produit", "type": "url"},
        {"name": "nb_produits", "type": "int"},
        {"name": "note", "type": "float", "args": {'num_range': [1, 20]}},
    ]
    df = create_example_csv_file(
        fields=fields,
        file_length=5,
        output_name="",
    )
    assert len(df) == 5
    # every id parses as a UUID
    for value in df["id_unique"]:
        assert UUID(value)
    # fixed-length random strings
    for value in df["nom_modele"]:
        assert len(value) == 20
    # pattern-generated values match the same regex
    siret_re = re.compile("^\\d{14}$")
    for value in df["siret"]:
        assert siret_re.match(value)
    # enum values only
    for value in df["type_producteur"]:
        assert value in producer_types
    # ISO dates sort lexicographically, so string comparison checks the range
    for value in df["date_creation"]:
        assert date_lo <= value <= date_hi
    for value in df["url_produit"]:
        assert value.startswith("http")
    for value in df["nb_produits"]:
        assert isinstance(value, int)
    for value in df["note"]:
        assert 1 <= value <= 20
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_example_from_tableschema():
    # Generate an example file straight from a published TableSchema (IRVE)
    # and check that the default number of rows is produced.
    irve_schema = "https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json"
    df = create_example_csv_file(
        schema_path=irve_schema,
        output_name="",
    )
    assert len(df) == 10
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/LICENSE.AGPL.txt
RENAMED
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/top_level.txt
RENAMED
|
File without changes
|