csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. csv_detective/cli.py +6 -9
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
  3. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
  4. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
  5. csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
  7. csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
  8. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
  9. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
  10. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  11. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
  12. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  13. csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
  14. csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
  15. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
  16. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
  17. csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
  18. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
  19. csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
  20. csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
  21. csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
  22. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
  23. csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
  24. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
  25. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
  26. csv_detective/detect_fields/__init__.py +94 -43
  27. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
  28. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
  29. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
  30. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  31. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  32. csv_detective/detect_fields/other/booleen/__init__.py +1 -1
  33. csv_detective/detect_fields/other/email/__init__.py +4 -2
  34. csv_detective/detect_fields/other/int/__init__.py +3 -3
  35. csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
  36. csv_detective/detect_fields/other/twitter/__init__.py +2 -2
  37. csv_detective/detect_fields/other/uuid/__init__.py +4 -5
  38. csv_detective/detect_fields/temp/date/__init__.py +3 -2
  39. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
  40. csv_detective/detect_fields/temp/year/__init__.py +1 -1
  41. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
  42. csv_detective/detect_labels/__init__.py +51 -1
  43. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
  44. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  45. csv_detective/detection/columns.py +9 -9
  46. csv_detective/detection/encoding.py +6 -4
  47. csv_detective/detection/engine.py +6 -5
  48. csv_detective/detection/formats.py +19 -19
  49. csv_detective/detection/headers.py +3 -5
  50. csv_detective/detection/rows.py +1 -1
  51. csv_detective/detection/variables.py +6 -7
  52. csv_detective/explore_csv.py +7 -8
  53. csv_detective/load_tests.py +7 -16
  54. csv_detective/output/__init__.py +3 -7
  55. csv_detective/output/dataframe.py +9 -5
  56. csv_detective/output/example.py +13 -13
  57. csv_detective/output/profile.py +30 -23
  58. csv_detective/output/schema.py +20 -23
  59. csv_detective/output/utils.py +15 -15
  60. csv_detective/parsing/columns.py +23 -12
  61. csv_detective/parsing/csv.py +1 -1
  62. csv_detective/parsing/excel.py +10 -11
  63. csv_detective/parsing/load.py +11 -8
  64. csv_detective/parsing/text.py +4 -9
  65. csv_detective/s3_utils.py +3 -7
  66. csv_detective/utils.py +4 -2
  67. csv_detective/validate.py +18 -13
  68. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/METADATA +12 -2
  69. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/RECORD +79 -79
  70. tests/test_example.py +2 -6
  71. tests/test_fields.py +16 -10
  72. tests/test_file.py +10 -9
  73. tests/test_labels.py +3 -2
  74. tests/test_structure.py +4 -3
  75. tests/test_validation.py +9 -6
  76. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/WHEEL +0 -0
  77. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt +0 -0
  78. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE +0 -0
  79. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt +0 -0
tests/test_fields.py CHANGED
@@ -1,8 +1,9 @@
1
- from datetime import date as _date, datetime as _datetime
1
+ from datetime import date as _date
2
+ from datetime import datetime as _datetime
2
3
 
3
- from numpy import random
4
4
  import pandas as pd
5
5
  import pytest
6
+ from numpy import random
6
7
 
7
8
  from csv_detective.detect_fields.FR.geo import (
8
9
  adresse,
@@ -23,8 +24,8 @@ from csv_detective.detect_fields.FR.geo import (
23
24
  )
24
25
  from csv_detective.detect_fields.FR.other import (
25
26
  code_csp_insee,
26
- code_rna,
27
27
  code_import,
28
+ code_rna,
28
29
  code_waldec,
29
30
  csp_insee,
30
31
  date_fr,
@@ -56,9 +57,13 @@ from csv_detective.detect_fields.other import (
56
57
  twitter,
57
58
  url,
58
59
  uuid,
59
- int as test_int,
60
+ )
61
+ from csv_detective.detect_fields.other import (
60
62
  float as test_float,
61
63
  )
64
+ from csv_detective.detect_fields.other import (
65
+ int as test_int,
66
+ )
62
67
  from csv_detective.detect_fields.temp import (
63
68
  date,
64
69
  datetime_aware,
@@ -67,8 +72,8 @@ from csv_detective.detect_fields.temp import (
67
72
  year,
68
73
  )
69
74
  from csv_detective.detection.variables import (
70
- detect_continuous_variable,
71
75
  detect_categorical_variable,
76
+ detect_continuous_variable,
72
77
  )
73
78
  from csv_detective.load_tests import return_all_tests
74
79
  from csv_detective.output.dataframe import cast
@@ -225,10 +230,7 @@ fields = {
225
230
  True: ["13 fevrier 1996"],
226
231
  False: ["44 march 2025"],
227
232
  },
228
- insee_ape700: {
229
- True: ["0116Z"],
230
- False: ["0116A"]
231
- },
233
+ insee_ape700: {True: ["0116Z"], False: ["0116A"]},
232
234
  tel_fr: {
233
235
  True: ["0134643467"],
234
236
  False: ["6625388263", "01288398"],
@@ -360,7 +362,11 @@ fields = {
360
362
  },
361
363
  datetime_naive: {
362
364
  True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
363
- False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10+02:00"],
365
+ False: [
366
+ "2021-06-22T30:20:10",
367
+ "Sun, 06 Nov 1994 08:49:37 GMT",
368
+ "2021-06-44 10:20:10+02:00",
369
+ ],
364
370
  },
365
371
  datetime_rfc822: {
366
372
  True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
tests/test_file.py CHANGED
@@ -1,7 +1,8 @@
1
+ from unittest.mock import patch
2
+
1
3
  import pandas as pd
2
4
  import pytest
3
5
  import responses
4
- from unittest.mock import patch
5
6
 
6
7
  from csv_detective import routine
7
8
 
@@ -70,10 +71,10 @@ def test_profile_output_on_file():
70
71
  [
71
72
  c in list(output["profile"]["NUMCOM"].keys())
72
73
  for c in [
73
- "min",
74
- "max",
75
- "mean",
76
- "std",
74
+ "min",
75
+ "max",
76
+ "mean",
77
+ "std",
77
78
  ]
78
79
  ]
79
80
  )
@@ -191,7 +192,7 @@ def mocked_responses():
191
192
  "params",
192
193
  # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
193
194
  # which doesn't support the way we mock the response, TBC
194
- params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})]
195
+ params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})],
195
196
  )
196
197
  def test_urls(mocked_responses, params):
197
198
  file_name, checks = params
@@ -261,17 +262,17 @@ def test_cast_json(mocked_responses, cast_json):
261
262
  cast_json, expected_type = cast_json
262
263
  expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
263
264
  mocked_responses.get(
264
- 'http://example.com/test.csv',
265
+ "http://example.com/test.csv",
265
266
  body=expected_content,
266
267
  status=200,
267
268
  )
268
269
  analysis, df = routine(
269
- file_path='http://example.com/test.csv',
270
+ file_path="http://example.com/test.csv",
270
271
  num_rows=-1,
271
272
  output_profile=False,
272
273
  save_results=False,
273
274
  output_df=True,
274
275
  cast_json=cast_json,
275
276
  )
276
- assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
277
+ assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
277
278
  assert isinstance(df["a_simple_dict"][0], expected_type)
tests/test_labels.py CHANGED
@@ -10,13 +10,14 @@ def test_money_labels():
10
10
 
11
11
 
12
12
  @pytest.mark.parametrize(
13
- "params", [
13
+ "params",
14
+ [
14
15
  ("latitude", 1.0),
15
16
  ("lat", 1.0),
16
17
  ("coord_lat", 0.5),
17
18
  ("y", 1.0),
18
19
  ("nb_cycles", 0.0),
19
- ]
20
+ ],
20
21
  )
21
22
  def test_latitude(params):
22
23
  header, expected = params
tests/test_structure.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import os
2
- # flake8: noqa
3
- from csv_detective import detect_fields, detect_labels
2
+
3
+ from csv_detective import detect_fields, detect_labels # noqa
4
4
  from csv_detective.load_tests import return_all_tests
5
5
 
6
6
 
@@ -18,7 +18,8 @@ def tests_conformity():
18
18
  if "__pycache__" not in dirname:
19
19
  subfolders.append(os.path.join(dirpath, dirname))
20
20
  final_subfolders = [
21
- sf for sf in subfolders
21
+ sf
22
+ for sf in subfolders
22
23
  if not any(other_sf.startswith(sf) for other_sf in subfolders if sf != other_sf)
23
24
  ]
24
25
  for f_sf in final_subfolders:
tests/test_validation.py CHANGED
@@ -30,13 +30,16 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
30
30
  ((False, None, None), {"separator": "|"}),
31
31
  ((False, None, None), {"encoding": "unknown"}),
32
32
  ((False, None, None), {"header": ["a", "b"]}),
33
- ((False, pd.DataFrame, dict), {
34
- "columns.NUMCOM": {
35
- "python_type": "int",
36
- "format": "int",
37
- "score": 1.0,
33
+ (
34
+ (False, pd.DataFrame, dict),
35
+ {
36
+ "columns.NUMCOM": {
37
+ "python_type": "int",
38
+ "format": "int",
39
+ "score": 1.0,
40
+ },
38
41
  },
39
- }),
42
+ ),
40
43
  ),
41
44
  )
42
45
  def test_validation(_params):