csv-detective 0.8.1.dev1440__py3-none-any.whl → 0.8.1.dev1460__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,7 @@ def routine(
75
75
  analysis = detect_formats(
76
76
  table=table,
77
77
  analysis=analysis,
78
+ file_path=file_path,
78
79
  user_input_tests=user_input_tests,
79
80
  limited_output=limited_output,
80
81
  skipna=skipna,
@@ -145,6 +146,7 @@ def validate_then_detect(
145
146
  analysis = detect_formats(
146
147
  table=table,
147
148
  analysis=analysis,
149
+ file_path=file_path,
148
150
  user_input_tests=user_input_tests,
149
151
  limited_output=limited_output,
150
152
  skipna=skipna,
@@ -5,6 +5,7 @@
5
5
  - Refactor label testing [#119](https://github.com/datagouv/csv-detective/pull/119)
6
6
  - Refactor repo metadata and requirements [#120](https://github.com/datagouv/csv-detective/pull/120) [#122](https://github.com/datagouv/csv-detective/pull/122)
7
7
  - Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
8
+ - For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124)
8
9
 
9
10
  ## 0.8.0 (2025-05-20)
10
11
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv_detective
3
- Version: 0.8.1.dev1440
3
+ Version: 0.8.1.dev1460
4
4
  Summary: Detect tabular files column content
5
5
  Home-page: https://github.com/datagouv/csv_detective
6
6
  Author: Etalab
@@ -1,6 +1,6 @@
1
1
  csv_detective/__init__.py,sha256=fxctDlEyUexNk_ePriWu6V05xZEeirMV0v_StnEZ8vQ,165
2
2
  csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
3
- csv_detective/explore_csv.py,sha256=IT1-9TbS78p6oeDpQ5T6DQ93xQbobcscyBQb6nh86H4,9082
3
+ csv_detective/explore_csv.py,sha256=YxXgaUqUNdAGsU8bC-cs_TVvSza4wc4aMJQjWRkRT5s,9144
4
4
  csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
5
5
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
6
6
  csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
@@ -127,19 +127,19 @@ csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5
127
127
  csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
128
128
  csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
129
129
  csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
130
- csv_detective-0.8.1.dev1440.data/data/share/csv_detective/CHANGELOG.md,sha256=b-F0tSnDQUauOqqPJCg57dvlaLt_xsb6J6O88RiiKwY,8603
131
- csv_detective-0.8.1.dev1440.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
132
- csv_detective-0.8.1.dev1440.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
133
- csv_detective-0.8.1.dev1440.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
130
+ csv_detective-0.8.1.dev1460.data/data/share/csv_detective/CHANGELOG.md,sha256=BsmO9YQAMi31co_c0I8aYRsm2m5Q5--vORWoJArdhOM,8725
131
+ csv_detective-0.8.1.dev1460.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
132
+ csv_detective-0.8.1.dev1460.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
133
+ csv_detective-0.8.1.dev1460.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
134
134
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
135
  tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
136
136
  tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
137
- tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
137
+ tests/test_file.py,sha256=FWVtYHlD5uU7tPeYsqlQg6O4lpU8Ct35vddkbzhvvjA,8508
138
138
  tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
139
139
  tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
140
140
  tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
141
- csv_detective-0.8.1.dev1440.dist-info/METADATA,sha256=4ECGBhA77ruP1PeRV0QamjdD1lfKOgoJ_RLJ8iiQ3nA,10443
142
- csv_detective-0.8.1.dev1440.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
143
- csv_detective-0.8.1.dev1440.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
144
- csv_detective-0.8.1.dev1440.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
145
- csv_detective-0.8.1.dev1440.dist-info/RECORD,,
141
+ csv_detective-0.8.1.dev1460.dist-info/METADATA,sha256=Rhi872uRXV2PcYpcI64GJ9vw12TsYIEQJxf8H1srLic,10443
142
+ csv_detective-0.8.1.dev1460.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
143
+ csv_detective-0.8.1.dev1460.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
144
+ csv_detective-0.8.1.dev1460.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
145
+ csv_detective-0.8.1.dev1460.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -1,42 +1,49 @@
1
1
  import pandas as pd
2
2
  import pytest
3
3
  import responses
4
+ from unittest.mock import patch
4
5
 
5
6
  from csv_detective import routine
6
7
 
7
8
 
8
- def test_columns_output_on_file():
9
- output = routine(
10
- file_path="tests/data/a_test_file.csv",
11
- num_rows=-1,
12
- output_profile=False,
13
- save_results=False,
14
- )
15
- assert isinstance(output, dict)
16
- assert output["separator"] == ";"
17
- assert output["header_row_idx"] == 2
18
- assert output["header"] == [
19
- "NUMCOM",
20
- "NOMCOM",
21
- "NUMDEP",
22
- "NOMDEP",
23
- "NUMEPCI",
24
- "NOMEPCI",
25
- "TXCOUVGLO_COM_2014",
26
- "TXCOUVGLO_DEP_2014",
27
- "TXCOUVGLO_EPCI_2014",
28
- "STRUCTURED_INFO",
29
- "GEO_INFO",
30
- ]
31
- assert output["total_lines"] == 404
32
- assert output["nb_duplicates"] == 7
33
- assert output["columns"]["NOMCOM"]["format"] == "commune"
34
- assert output["columns"]["NOMDEP"]["format"] == "departement"
35
- assert output["columns"]["NUMEPCI"]["format"] == "siren"
36
- assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
37
- assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
38
- assert output["columns"]["GEO_INFO"]["python_type"] == "json"
39
- assert output["columns"]["GEO_INFO"]["format"] == "json_geojson"
9
+ @pytest.mark.parametrize(
10
+ "reduce_max_rows_analysis",
11
+ (True, False),
12
+ )
13
+ def test_columns_output_on_file(reduce_max_rows_analysis):
14
+ patched = 100 if reduce_max_rows_analysis else 1e5
15
+ with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", patched):
16
+ output = routine(
17
+ file_path="tests/data/a_test_file.csv",
18
+ num_rows=-1,
19
+ output_profile=False,
20
+ save_results=False,
21
+ )
22
+ assert isinstance(output, dict)
23
+ assert output["separator"] == ";"
24
+ assert output["header_row_idx"] == 2
25
+ assert output["header"] == [
26
+ "NUMCOM",
27
+ "NOMCOM",
28
+ "NUMDEP",
29
+ "NOMDEP",
30
+ "NUMEPCI",
31
+ "NOMEPCI",
32
+ "TXCOUVGLO_COM_2014",
33
+ "TXCOUVGLO_DEP_2014",
34
+ "TXCOUVGLO_EPCI_2014",
35
+ "STRUCTURED_INFO",
36
+ "GEO_INFO",
37
+ ]
38
+ assert output["total_lines"] == 404
39
+ assert output["nb_duplicates"] == 7
40
+ assert output["columns"]["NOMCOM"]["format"] == "commune"
41
+ assert output["columns"]["NOMDEP"]["format"] == "departement"
42
+ assert output["columns"]["NUMEPCI"]["format"] == "siren"
43
+ assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
44
+ assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
45
+ assert output["columns"]["GEO_INFO"]["python_type"] == "json"
46
+ assert output["columns"]["GEO_INFO"]["format"] == "json_geojson"
40
47
 
41
48
 
42
49
  def test_profile_output_on_file():