csv-detective 0.9.3.dev2382__py3-none-any.whl → 0.9.3.dev2400__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -0
  6. csv_detective/detection/headers.py +0 -0
  7. csv_detective/detection/rows.py +0 -0
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/format.py +0 -0
  11. csv_detective/formats/__init__.py +0 -0
  12. csv_detective/formats/adresse.py +0 -0
  13. csv_detective/formats/booleen.py +0 -0
  14. csv_detective/formats/code_commune_insee.py +0 -0
  15. csv_detective/formats/code_csp_insee.py +0 -0
  16. csv_detective/formats/code_departement.py +0 -0
  17. csv_detective/formats/code_fantoir.py +0 -0
  18. csv_detective/formats/code_import.py +0 -0
  19. csv_detective/formats/code_postal.py +0 -0
  20. csv_detective/formats/code_region.py +0 -0
  21. csv_detective/formats/code_rna.py +0 -0
  22. csv_detective/formats/code_waldec.py +0 -0
  23. csv_detective/formats/commune.py +0 -0
  24. csv_detective/formats/csp_insee.py +0 -0
  25. csv_detective/formats/date.py +0 -0
  26. csv_detective/formats/date_fr.py +0 -0
  27. csv_detective/formats/datetime_aware.py +0 -0
  28. csv_detective/formats/datetime_naive.py +0 -0
  29. csv_detective/formats/datetime_rfc822.py +0 -0
  30. csv_detective/formats/departement.py +0 -0
  31. csv_detective/formats/email.py +0 -0
  32. csv_detective/formats/float.py +0 -0
  33. csv_detective/formats/geojson.py +0 -0
  34. csv_detective/formats/insee_ape700.py +0 -0
  35. csv_detective/formats/insee_canton.py +0 -0
  36. csv_detective/formats/int.py +0 -0
  37. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  38. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  39. csv_detective/formats/iso_country_code_numeric.py +0 -0
  40. csv_detective/formats/jour_de_la_semaine.py +0 -0
  41. csv_detective/formats/json.py +0 -0
  42. csv_detective/formats/latitude_l93.py +0 -0
  43. csv_detective/formats/latitude_wgs.py +0 -0
  44. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  45. csv_detective/formats/latlon_wgs.py +0 -0
  46. csv_detective/formats/longitude_l93.py +0 -0
  47. csv_detective/formats/longitude_wgs.py +0 -0
  48. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  49. csv_detective/formats/lonlat_wgs.py +0 -0
  50. csv_detective/formats/mois_de_lannee.py +0 -0
  51. csv_detective/formats/money.py +0 -0
  52. csv_detective/formats/mongo_object_id.py +0 -0
  53. csv_detective/formats/pays.py +0 -0
  54. csv_detective/formats/percent.py +0 -0
  55. csv_detective/formats/region.py +0 -0
  56. csv_detective/formats/sexe.py +0 -0
  57. csv_detective/formats/siren.py +0 -0
  58. csv_detective/formats/siret.py +0 -0
  59. csv_detective/formats/tel_fr.py +0 -0
  60. csv_detective/formats/uai.py +0 -0
  61. csv_detective/formats/url.py +2 -1
  62. csv_detective/formats/username.py +0 -0
  63. csv_detective/formats/uuid.py +0 -0
  64. csv_detective/formats/year.py +0 -0
  65. csv_detective/output/__init__.py +0 -0
  66. csv_detective/output/dataframe.py +0 -0
  67. csv_detective/output/example.py +0 -0
  68. csv_detective/output/profile.py +0 -0
  69. csv_detective/output/schema.py +0 -0
  70. csv_detective/output/utils.py +0 -0
  71. csv_detective/parsing/__init__.py +0 -0
  72. csv_detective/parsing/columns.py +0 -0
  73. csv_detective/parsing/compression.py +0 -0
  74. csv_detective/parsing/csv.py +0 -0
  75. csv_detective/parsing/excel.py +0 -0
  76. csv_detective/parsing/load.py +0 -0
  77. csv_detective/validate.py +0 -0
  78. {csv_detective-0.9.3.dev2382.dist-info → csv_detective-0.9.3.dev2400.dist-info}/METADATA +17 -18
  79. {csv_detective-0.9.3.dev2382.dist-info → csv_detective-0.9.3.dev2400.dist-info}/RECORD +14 -25
  80. csv_detective-0.9.3.dev2400.dist-info/WHEEL +4 -0
  81. {csv_detective-0.9.3.dev2382.dist-info → csv_detective-0.9.3.dev2400.dist-info}/entry_points.txt +1 -0
  82. csv_detective-0.9.3.dev2382.dist-info/WHEEL +0 -5
  83. csv_detective-0.9.3.dev2382.dist-info/licenses/LICENSE +0 -21
  84. csv_detective-0.9.3.dev2382.dist-info/top_level.txt +0 -4
  85. tests/__init__.py +0 -0
  86. tests/test_example.py +0 -67
  87. tests/test_fields.py +0 -167
  88. tests/test_file.py +0 -413
  89. tests/test_labels.py +0 -26
  90. tests/test_structure.py +0 -45
  91. tests/test_validation.py +0 -108
  92. venv/bin/activate_this.py +0 -38
  93. venv/bin/runxlrd.py +0 -410
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
csv_detective/format.py CHANGED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -20,7 +20,7 @@ labels = [
20
20
 
21
21
  pattern = re.compile(
22
22
  r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
23
- r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$"
23
+ r"(/[A-Za-z\u00C0-\u024F\u1E00-\u1EFF0-9\s._~:/?#[@!$&'()*+,;=%-]*)?$"
24
24
  )
25
25
 
26
26
 
@@ -40,6 +40,7 @@ _test_values = {
40
40
  "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
41
41
  "?score__greater=0.9&decompte__exact=13"
42
42
  ),
43
+ "https://une-ville.fr/délibérations/2025/Doc avec espaces et àccëñts.pdf",
43
44
  ],
44
45
  False: ["tmp@data.gouv.fr"],
45
46
  }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
csv_detective/validate.py CHANGED
File without changes
@@ -1,33 +1,32 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: csv-detective
3
- Version: 0.9.3.dev2382
3
+ Version: 0.9.3.dev2400
4
4
  Summary: Detect tabular files column content
5
- Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
6
- License: MIT
7
- Project-URL: Source, https://github.com/datagouv/csv_detective
8
5
  Keywords: CSV,data processing,encoding,guess,parser,tabular
9
- Requires-Python: <3.15,>=3.10
10
- Description-Content-Type: text/markdown
11
- License-File: LICENSE
12
- Requires-Dist: dateparser<2,>=1.2.0
6
+ Author: data.gouv.fr
7
+ Author-email: data.gouv.fr <opendatateam@data.gouv.fr>
8
+ License: MIT
9
+ Requires-Dist: dateparser>=1.2.0,<2
13
10
  Requires-Dist: faust-cchardet==2.1.19
14
- Requires-Dist: pandas<3,>=2.2.0
15
- Requires-Dist: python-dateutil<3,>=2.8.2
16
- Requires-Dist: Unidecode<2,>=1.3.6
11
+ Requires-Dist: pandas>=2.2.0,<3
12
+ Requires-Dist: python-dateutil>=2.8.2,<3
13
+ Requires-Dist: unidecode>=1.3.6,<2
17
14
  Requires-Dist: openpyxl>=3.1.5
18
15
  Requires-Dist: xlrd>=2.0.1
19
16
  Requires-Dist: odfpy>=1.4.1
20
- Requires-Dist: requests<3,>=2.32.3
17
+ Requires-Dist: requests>=2.32.3,<3
21
18
  Requires-Dist: python-magic>=0.4.27
22
19
  Requires-Dist: frformat==0.4.0
23
- Requires-Dist: Faker>=33.0.0
20
+ Requires-Dist: faker>=33.0.0
24
21
  Requires-Dist: rstr>=3.2.2
25
22
  Requires-Dist: more-itertools>=10.8.0
23
+ Requires-Dist: pytest>=8.3.0 ; extra == 'dev'
24
+ Requires-Dist: responses>=0.25.0 ; extra == 'dev'
25
+ Requires-Dist: ruff>=0.9.3 ; extra == 'dev'
26
+ Requires-Python: >=3.10, <3.15
27
+ Project-URL: Source, https://github.com/datagouv/csv_detective
26
28
  Provides-Extra: dev
27
- Requires-Dist: pytest>=8.3.0; extra == "dev"
28
- Requires-Dist: responses>=0.25.0; extra == "dev"
29
- Requires-Dist: ruff>=0.9.3; extra == "dev"
30
- Dynamic: license-file
29
+ Description-Content-Type: text/markdown
31
30
 
32
31
  # CSV Detective
33
32
 
@@ -1,9 +1,5 @@
1
1
  csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
2
2
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
- csv_detective/explore_csv.py,sha256=-LCHr7vyT0Q0oLtXeOO8pEevJ6-8Ib9JP3D7nVgZM8o,7090
4
- csv_detective/format.py,sha256=XX_cSTQc0jlsQq3GUqHi7Cz36AiRrpjrwPmeoOTLMvo,2396
5
- csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
6
- csv_detective/validate.py,sha256=XldlbGkUlPaIh0y4z9iaWlmmahwCrD1900s5Cxlq5wI,5430
7
3
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
4
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
9
5
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
@@ -13,6 +9,8 @@ csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6H
13
9
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
14
10
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
15
11
  csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
12
+ csv_detective/explore_csv.py,sha256=-LCHr7vyT0Q0oLtXeOO8pEevJ6-8Ib9JP3D7nVgZM8o,7090
13
+ csv_detective/format.py,sha256=XX_cSTQc0jlsQq3GUqHi7Cz36AiRrpjrwPmeoOTLMvo,2396
16
14
  csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
17
15
  csv_detective/formats/adresse.py,sha256=jALDpEDAWyAcgqEfNVRg_W1r6XaYuJKD_jAaP2l-bxk,1943
18
16
  csv_detective/formats/booleen.py,sha256=AnDDKShkSYpWO4POhwY2V7_C4yPWbmqBu8CJPgQ9Gwc,648
@@ -27,6 +25,11 @@ csv_detective/formats/code_rna.py,sha256=WExlQtlAUfOFT4N3MKsMBhZVxTdNzgexFjmXhZd
27
25
  csv_detective/formats/code_waldec.py,sha256=kJEJfikbhMfVwtA8hBpup0tpeSFoY_rWrEdXQxgNwhg,297
28
26
  csv_detective/formats/commune.py,sha256=oVpwINGqpwMOT43KkasozipJ9hBeoQ5FrKV_wIeVJGE,532
29
27
  csv_detective/formats/csp_insee.py,sha256=HE6NK6Sw91mLFeAAKwWUXZZfXX6fiA0zK4RI4YdkUFY,656
28
+ csv_detective/formats/data/csp_insee.txt,sha256=kgKaKc-5PHu5U4--ugLjpFyMNtTU9CGdZ9ANU3YAsM4,32879
29
+ csv_detective/formats/data/insee_ape700.txt,sha256=nKgslakENwgE7sPkVNHqR23iXuxF02p9-v5MC2_ntx8,4398
30
+ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=YyPlDqCdz65ecf4Wes_r0P4rDSJG35niXtjc4MmctXM,1740
31
+ csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
32
+ csv_detective/formats/data/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
30
33
  csv_detective/formats/date.py,sha256=X4ohXaFO8cXPJktUSumc3bfdlbDIWEYTG8S9ugVRcsE,2730
31
34
  csv_detective/formats/date_fr.py,sha256=3hTw5RommrhcgECFRSt9KgyB9zyi1j4W3UygEHmRgoE,502
32
35
  csv_detective/formats/datetime_aware.py,sha256=-1ZBix6vYlYXTvhXrijP-98AN7iPB0x_DbbwU1QjMCI,1470
@@ -63,15 +66,10 @@ csv_detective/formats/siren.py,sha256=ieLe50vdSnkXadcUI8VXnnId9GFGHyIBWVTP6bJtyM
63
66
  csv_detective/formats/siret.py,sha256=ehkZgOH-HggN6IgxF4G0DMut_6giZ3gc4g9wMdwZFHQ,997
64
67
  csv_detective/formats/tel_fr.py,sha256=yKCqIlqKO2yKucCoCjYfSjqNKfTjqFcmNXxg6THG0WE,624
65
68
  csv_detective/formats/uai.py,sha256=uT5gjdTmoFH9QPZdTFkJgiyuKLW0B6KmT6yqHQeaeOU,711
66
- csv_detective/formats/url.py,sha256=GYE9j_i4kpEQueBXa1Fla0wk8_sc0n230GL3KaIRvwY,932
69
+ csv_detective/formats/url.py,sha256=j6tCbcEzQw7U53ixeeFfhzueN8syVgQsjmAmY7RRWdU,1049
67
70
  csv_detective/formats/username.py,sha256=y38OggfWpEQsGi0JnD9QRM30musa29lO6nz-qybR24U,249
68
71
  csv_detective/formats/uuid.py,sha256=ekMEFfzQtz0cLudzmu3AoCM0Yf5pu23qAcFNFgHWJ1A,346
69
72
  csv_detective/formats/year.py,sha256=pkAfYPKZdy0g1ZoHGgJNpgTS5y5weGEKXCVMGaxIX8k,472
70
- csv_detective/formats/data/csp_insee.txt,sha256=kgKaKc-5PHu5U4--ugLjpFyMNtTU9CGdZ9ANU3YAsM4,32879
71
- csv_detective/formats/data/insee_ape700.txt,sha256=nKgslakENwgE7sPkVNHqR23iXuxF02p9-v5MC2_ntx8,4398
72
- csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=YyPlDqCdz65ecf4Wes_r0P4rDSJG35niXtjc4MmctXM,1740
73
- csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
74
- csv_detective/formats/data/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
75
73
  csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
76
74
  csv_detective/output/dataframe.py,sha256=TyBc2ObaVUns_ydJWOMKmCYvuj7ddxag0QN3z37g3GE,3219
77
75
  csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
@@ -85,18 +83,9 @@ csv_detective/parsing/csv.py,sha256=0T0gpaXzwJo-sq41IoLQD704GiMUYeDVVASVbat-zWg,
85
83
  csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
86
84
  csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
87
85
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
88
- csv_detective-0.9.3.dev2382.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
89
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
- tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
91
- tests/test_fields.py,sha256=EWHIKwRSdIh74bBSoozYmZBETf7V03JMWpglyxA0ci0,5616
92
- tests/test_file.py,sha256=MxJOWwhRG2Xm1_m3C9x8CS9FepjUebET-6EsMi3DvmY,13125
93
- tests/test_labels.py,sha256=kDPerWC3_J3l1p5I3-MHwz7BmhcuxZAws_wSgHCHUuI,536
94
- tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
95
- tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
96
- venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
97
- venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
98
- csv_detective-0.9.3.dev2382.dist-info/METADATA,sha256=gGotUngB4Ch3dhlapEv97KEq1JUX-xI1NsT51rOCZ1U,11084
99
- csv_detective-0.9.3.dev2382.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
- csv_detective-0.9.3.dev2382.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
101
- csv_detective-0.9.3.dev2382.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
102
- csv_detective-0.9.3.dev2382.dist-info/RECORD,,
86
+ csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
87
+ csv_detective/validate.py,sha256=XldlbGkUlPaIh0y4z9iaWlmmahwCrD1900s5Cxlq5wI,5430
88
+ csv_detective-0.9.3.dev2400.dist-info/WHEEL,sha256=z-mOpxbJHqy3cq6SvUThBZdaLGFZzdZPtgWLcP2NKjQ,79
89
+ csv_detective-0.9.3.dev2400.dist-info/entry_points.txt,sha256=1J86TQNCanjsLMboAufdEUla03qEQaC9QmVGYgt2FCQ,57
90
+ csv_detective-0.9.3.dev2400.dist-info/METADATA,sha256=XBMZp650BNXuUmMPEw7ffC7tNfMD69JGd0diGhKCIQE,11063
91
+ csv_detective-0.9.3.dev2400.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.15
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -1,2 +1,3 @@
1
1
  [console_scripts]
2
2
  csv_detective = csv_detective.cli:run
3
+
@@ -1,5 +0,0 @@
1
- Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
3
- Root-Is-Purelib: true
4
- Tag: py3-none-any
5
-
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 data.gouv.fr
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
@@ -1,4 +0,0 @@
1
- csv_detective
2
- dist
3
- tests
4
- venv
tests/__init__.py DELETED
File without changes
tests/test_example.py DELETED
@@ -1,67 +0,0 @@
1
- import re
2
- from uuid import UUID
3
-
4
- from csv_detective.output.example import create_example_csv_file
5
-
6
-
7
- def test_example_creation():
8
- fields = [
9
- {
10
- "name": "id_unique",
11
- "type": "id",
12
- },
13
- {
14
- "name": "nom_modele",
15
- "type": "str",
16
- "args": {"length": 20},
17
- },
18
- {
19
- "name": "siret",
20
- "type": "str",
21
- "args": {"pattern": "^\\d{14}$"},
22
- },
23
- {
24
- "name": "type_producteur",
25
- "type": "str",
26
- "args": {"enum": ["privé", "public", "association"]},
27
- },
28
- {
29
- "name": "date_creation",
30
- "type": "date",
31
- "args": {
32
- "date_range": ["1996-02-13", "2000-01-28"],
33
- "format": "%Y-%m-%d",
34
- },
35
- },
36
- {
37
- "name": "url_produit",
38
- "type": "url",
39
- },
40
- {
41
- "name": "nb_produits",
42
- "type": "int",
43
- },
44
- {"name": "note", "type": "float", "args": {"num_range": [1, 20]}},
45
- ]
46
- df = create_example_csv_file(
47
- fields=fields,
48
- file_length=5,
49
- output_name=None,
50
- )
51
- assert len(df) == 5
52
- assert all(UUID(_) for _ in df["id_unique"])
53
- assert all(len(_) == 20 for _ in df["nom_modele"])
54
- assert all(re.match("^\\d{14}$", _) for _ in df["siret"])
55
- assert all(_ in ["privé", "public", "association"] for _ in df["type_producteur"])
56
- assert all(_ >= "1996-02-13" and _ <= "2000-01-28" for _ in df["date_creation"])
57
- assert all(_.startswith("http") for _ in df["url_produit"])
58
- assert all(isinstance(_, int) for _ in df["nb_produits"])
59
- assert all(_ >= 1 and _ <= 20 for _ in df["note"])
60
-
61
-
62
- def test_example_from_tableschema():
63
- df = create_example_csv_file(
64
- schema_path="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json",
65
- output_name=None,
66
- )
67
- assert len(df) == 10
tests/test_fields.py DELETED
@@ -1,167 +0,0 @@
1
- from datetime import date as _date
2
- from datetime import datetime as _datetime
3
- from unittest.mock import patch
4
-
5
- import pandas as pd
6
- import pytest
7
- from numpy import random
8
-
9
- from csv_detective.detection.variables import (
10
- detect_categorical_variable,
11
- detect_continuous_variable,
12
- )
13
- from csv_detective.format import FormatsManager
14
- from csv_detective.output.dataframe import cast
15
- from csv_detective.output.utils import prepare_output_dict
16
- from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it
17
-
18
- fmtm = FormatsManager()
19
-
20
-
21
- def test_all_format_funcs_return_bool():
22
- for format in fmtm.formats.values():
23
- for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
24
- assert isinstance(format.func(tmp), bool)
25
-
26
-
27
- # categorical
28
- def test_detect_categorical_variable():
29
- categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
30
- categorical_col2 = [str(k // 20) for k in range(100)]
31
- not_categorical_col = [i for i in range(100)]
32
-
33
- df_dict = {
34
- "cat": categorical_col,
35
- "cat2": categorical_col2,
36
- "not_cat": not_categorical_col,
37
- }
38
- df = pd.DataFrame(df_dict, dtype=str)
39
-
40
- res, _ = detect_categorical_variable(df)
41
- assert len(res) and all(k in res for k in ["cat", "cat2"])
42
-
43
-
44
- # continuous
45
- def test_detect_continuous_variable():
46
- continuous_col = random.random(100)
47
- continuous_col_2 = [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 21, 3] * 10
48
- not_continuous_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
49
-
50
- df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
51
- df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
52
-
53
- df = pd.DataFrame(df_dict, dtype=str)
54
- df2 = pd.DataFrame(df_dict_2, dtype=str)
55
-
56
- res = detect_continuous_variable(df)
57
- res2 = detect_continuous_variable(df2, continuous_th=0.65)
58
- assert res.values and res.values[0] == "cont"
59
- assert res2.values and res2.values[0] == "cont"
60
-
61
-
62
- # we could also have a function here to add all True values of (almost)
63
- # each field to the False values of all others (to do when parenthood is added)
64
-
65
-
66
- def test_all_fields_have_tests():
67
- for format in fmtm.formats.values():
68
- valid = format._test_values
69
- # checking structure
70
- assert all(
71
- isinstance(key, bool)
72
- and isinstance(vals, list)
73
- and all(isinstance(val, str) for val in vals)
74
- for key, vals in valid.items()
75
- )
76
- # checking that we have valid and invalid cases for each
77
- assert all(b in valid.keys() for b in [True, False])
78
-
79
-
80
- # this is based on the _test_values of each <format>.py file
81
- @pytest.mark.parametrize(
82
- "args",
83
- (
84
- (format.func, value, valid)
85
- for valid in [True, False]
86
- for format in fmtm.formats.values()
87
- for value in format._test_values[valid]
88
- ),
89
- )
90
- def test_fields_with_values(args):
91
- func, value, valid = args
92
- assert func(value) is valid
93
-
94
-
95
- @pytest.mark.parametrize(
96
- "args",
97
- (
98
- ("1.9", "float", float),
99
- ("oui", "bool", bool),
100
- ("[1, 2]", "json", list),
101
- ('{"a": 1}', "json", dict),
102
- ("2022-08-01", "date", _date),
103
- ("2024-09-23 17:32:07", "datetime", _datetime),
104
- ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
105
- ),
106
- )
107
- def test_cast(args):
108
- value, detected_type, cast_type = args
109
- assert isinstance(cast(value, detected_type), cast_type)
110
-
111
-
112
- @pytest.mark.parametrize(
113
- "args",
114
- (
115
- # there is a specific numerical format => specific wins
116
- ({"int": 1, "float": 1, "latitude_wgs": 1}, "latitude_wgs"),
117
- # scores are equal for related formats => priority wins
118
- ({"int": 1, "float": 1}, "int"),
119
- # score is lower for priority format => secondary wins
120
- ({"int": 0.5, "float": 1}, "float"),
121
- # score is lower for priority format, but is 1 => priority wins
122
- ({"int": 1, "float": 1.25}, "int"),
123
- # two rounds of priority => highest priority wins
124
- ({"latlon_wgs": 1, "lonlat_wgs": 1, "json": 1}, "latlon_wgs"),
125
- # no detection => default to string
126
- ({}, "string"),
127
- ),
128
- )
129
- def test_priority(args):
130
- detections, expected = args
131
- col = "col1"
132
- output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
133
- assert output[col]["format"] == expected
134
-
135
-
136
- @pytest.mark.parametrize(
137
- "args",
138
- (
139
- ("1996-02-13", fmtm.formats["date"]),
140
- ("28/01/2000", fmtm.formats["date"]),
141
- ("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]),
142
- ("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]),
143
- ("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]),
144
- ("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]),
145
- ),
146
- )
147
- def test_early_detection(args):
148
- value, format = args
149
- with patch("csv_detective.formats.date.date_casting") as mock_func:
150
- res = format.func(value)
151
- assert res
152
- mock_func.assert_not_called()
153
-
154
-
155
- def test_all_proportion_1():
156
- # building a table that uses only correct values for these formats, except on one row
157
- table = pd.DataFrame(
158
- {
159
- name: (format._test_values[True] * 100)[:100] + ["not_suitable"]
160
- for name, format in fmtm.formats.items()
161
- if format.proportion == 1
162
- }
163
- )
164
- # testing columns for all formats
165
- returned_table = col_test(table, fmtm.formats, limited_output=True)
166
- # the analysis should have found no match on any format
167
- assert all(returned_table[col].sum() == 0 for col in table.columns)