csv-detective 0.9.1.dev1869__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,14 +25,7 @@ def generate_output(
25
25
  verbose: bool = False,
26
26
  sheet_name: Optional[Union[str, int]] = None,
27
27
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
28
- if output_profile or output_df:
29
- # to create the profile we have to cast columns, so using the dedicated function
30
- table = cast_df(
31
- df=table,
32
- columns=analysis["columns"],
33
- cast_json=cast_json,
34
- verbose=verbose,
35
- )
28
+ if output_profile:
36
29
  analysis["profile"] = create_profile(
37
30
  table=table,
38
31
  columns=analysis["columns"],
@@ -61,5 +54,10 @@ def generate_output(
61
54
  analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
62
55
 
63
56
  if output_df:
64
- return analysis, table
57
+ return analysis, cast_df(
58
+ df=table,
59
+ columns=analysis["columns"],
60
+ cast_json=cast_json,
61
+ verbose=verbose,
62
+ )
65
63
  return analysis
@@ -4,7 +4,8 @@ from time import time
4
4
 
5
5
  import pandas as pd
6
6
 
7
- from csv_detective.utils import display_logs_depending_process_time, prevent_nan
7
+ from csv_detective.detect_fields.other.float import float_casting
8
+ from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
8
9
 
9
10
 
10
11
  def create_profile(
@@ -18,11 +19,6 @@ def create_profile(
18
19
  if verbose:
19
20
  start = time()
20
21
  logging.info("Creating profile")
21
- map_python_types = {
22
- "string": str,
23
- "int": float,
24
- "float": float,
25
- }
26
22
 
27
23
  if num_rows > 0:
28
24
  raise ValueError("To create profiles num_rows has to be set to -1")
@@ -35,12 +31,19 @@ def create_profile(
35
31
  for c in table.columns:
36
32
  # for numerical formats we want min, max, mean, std
37
33
  if columns[c]["python_type"] in ["float", "int"]:
34
+ # we locally cast the column to perform the operations, using the same method as in cast_df
35
+ cast_col = (
36
+ table[c].astype(pd.Int64Dtype())
37
+ if columns[c]["python_type"] == "int"
38
+ else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
39
+ )
38
40
  profile[c].update(
39
- min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
40
- max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
41
- mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
42
- std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
41
+ min=cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
42
+ max=cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
43
+ mean=cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
44
+ std=cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
43
45
  )
46
+ del cast_col
44
47
  # for all formats we want most frequent values, nb unique values and nb missing values
45
48
  tops_bruts = (
46
49
  table.loc[table[c].notna(), c]
csv_detective/utils.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Optional
2
+ from typing import Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
@@ -31,5 +31,7 @@ def is_url(file_path: str) -> bool:
31
31
  return file_path.startswith("http")
32
32
 
33
33
 
34
- def prevent_nan(value: float) -> Optional[float]:
35
- return None if pd.isna(value) else value
34
+ def cast_prevent_nan(value: float, _type: str) -> Optional[Union[float, int]]:
35
+ if _type not in {"int", "float"}:
36
+ raise ValueError(f"Invalid type was passed: {_type}")
37
+ return None if pd.isna(value) else eval(_type)(value)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.1.dev1869
3
+ Version: 0.9.2
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -3,7 +3,7 @@ csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
3
  csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
4
4
  csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
5
5
  csv_detective/s3_utils.py,sha256=z1KTVVkdurMv21o-rZu7_aluMJnSi-d5uxnQbqT2NoI,1407
6
- csv_detective/utils.py,sha256=u9I1tsyMfVr2eIYiGCD7Iu30d55H3za44-N3cV2nj8M,1013
6
+ csv_detective/utils.py,sha256=xiIO7ZDqkTm9Rnhnq6RaDdnrPIfoG0JV9AsmaOG6plA,1162
7
7
  csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
8
8
  csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
9
9
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -137,10 +137,10 @@ csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPp
137
137
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
138
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
139
139
  csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
140
- csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
140
+ csv_detective/output/__init__.py,sha256=bMsLp-XCVf4sNymIof_kdMdqFIY7GocOas-lPNekfQg,1930
141
141
  csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
142
142
  csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
143
- csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
143
+ csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
144
144
  csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
145
145
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
146
146
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
150
150
  csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
151
151
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
152
152
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.1.dev1869.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.9.2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
154
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
156
  tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
157
- tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
157
+ tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
158
158
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
159
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
160
160
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
161
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
162
  venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
163
163
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.1.dev1869.dist-info/METADATA,sha256=3gGiQT_yLk3thJkrLt5l90W8ylzk_MVYN0_F3wGv5qE,9767
165
- csv_detective-0.9.1.dev1869.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.1.dev1869.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.1.dev1869.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.1.dev1869.dist-info/RECORD,,
164
+ csv_detective-0.9.2.dist-info/METADATA,sha256=Yval8NfM6FC2eiIz8bybr9vbjJXOgS81VHzDJiBiPGI,9759
165
+ csv_detective-0.9.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.9.2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.9.2.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
+ csv_detective-0.9.2.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -5,6 +5,7 @@ import pytest
5
5
  import responses
6
6
 
7
7
  from csv_detective import routine
8
+ from csv_detective.output.profile import create_profile
8
9
 
9
10
 
10
11
  @pytest.mark.parametrize(
@@ -97,6 +98,55 @@ def test_profile_with_num_rows():
97
98
  )
98
99
 
99
100
 
101
+ @pytest.mark.parametrize(
102
+ "params",
103
+ (
104
+ (
105
+ True,
106
+ {
107
+ "int_with_nan": {"format": "int", "python_type": "int"},
108
+ "date": {"format": "date", "python_type": "date"},
109
+ },
110
+ ),
111
+ (
112
+ False,
113
+ {
114
+ "int_with_nan": [{"format": "int", "python_type": "int"}],
115
+ "date": [{"format": "date", "python_type": "date"}],
116
+ },
117
+ ),
118
+ ),
119
+ )
120
+ def test_profile_specific_cases(params):
121
+ limited_output, columns = params
122
+ table = pd.DataFrame(
123
+ {
124
+ "int_with_nan": ["1", pd.NA, pd.NA],
125
+ "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
126
+ }
127
+ )
128
+ profile = create_profile(
129
+ table=table,
130
+ columns=columns,
131
+ limited_output=limited_output,
132
+ num_rows=-1,
133
+ )
134
+ assert profile["int_with_nan"] == {
135
+ "min": 1,
136
+ "max": 1,
137
+ "mean": 1,
138
+ "std": None,
139
+ "tops": [{"count": 1, "value": "1"}],
140
+ "nb_distinct": 1,
141
+ "nb_missing_values": 2,
142
+ }
143
+ assert profile["date"] == {
144
+ "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
145
+ "nb_distinct": 2,
146
+ "nb_missing_values": 0,
147
+ }
148
+
149
+
100
150
  def test_exception_different_number_of_columns():
101
151
  """
102
152
  A ValueError should be raised if the number of columns differs between the first rows