csv-detective 0.7.5.dev1180__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_fields/temp/date/__init__.py +5 -1
  8. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  35. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  43. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  44. csv_detective/detect_labels/other/email/__init__.py +1 -1
  45. csv_detective/detect_labels/other/float/__init__.py +1 -1
  46. csv_detective/detect_labels/other/int/__init__.py +1 -1
  47. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  48. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  49. csv_detective/detect_labels/other/url/__init__.py +1 -1
  50. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  54. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  55. csv_detective/detection/columns.py +89 -0
  56. csv_detective/detection/encoding.py +27 -0
  57. csv_detective/detection/engine.py +46 -0
  58. csv_detective/detection/headers.py +32 -0
  59. csv_detective/detection/rows.py +18 -0
  60. csv_detective/detection/separator.py +44 -0
  61. csv_detective/detection/variables.py +98 -0
  62. csv_detective/explore_csv.py +40 -110
  63. csv_detective/output/dataframe.py +55 -0
  64. csv_detective/{create_example.py → output/example.py} +10 -9
  65. csv_detective/output/profile.py +87 -0
  66. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  67. csv_detective/output/utils.py +51 -0
  68. csv_detective/parsing/columns.py +141 -0
  69. csv_detective/parsing/compression.py +11 -0
  70. csv_detective/parsing/csv.py +55 -0
  71. csv_detective/parsing/excel.py +169 -0
  72. csv_detective/parsing/load.py +97 -0
  73. csv_detective/utils.py +10 -236
  74. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +3 -0
  75. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +3 -2
  76. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +85 -71
  77. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +1 -1
  78. tests/test_fields.py +7 -6
  79. tests/test_file.py +56 -57
  80. csv_detective/detection.py +0 -618
  81. csv_detective/{process_text.py → parsing/text.py} +0 -0
  82. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  83. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
  84. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
  85. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info/licenses}/LICENSE.AGPL.txt +0 -0
  86. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
tests/test_file.py CHANGED
@@ -1,12 +1,13 @@
1
- from csv_detective import routine
1
+ import pandas as pd
2
2
  import pytest
3
3
  import responses
4
- import pandas as pd
4
+
5
+ from csv_detective import routine
5
6
 
6
7
 
7
8
  def test_columns_output_on_file():
8
9
  output = routine(
9
- csv_file_path="tests/a_test_file.csv",
10
+ file_path="tests/data/a_test_file.csv",
10
11
  num_rows=-1,
11
12
  output_profile=False,
12
13
  save_results=False,
@@ -40,7 +41,7 @@ def test_columns_output_on_file():
40
41
 
41
42
  def test_profile_output_on_file():
42
43
  output = routine(
43
- csv_file_path="tests/a_test_file.csv",
44
+ file_path="tests/data/a_test_file.csv",
44
45
  num_rows=-1,
45
46
  output_profile=True,
46
47
  save_results=False,
@@ -69,10 +70,10 @@ def test_profile_output_on_file():
69
70
  assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
70
71
 
71
72
 
72
- def test_exception():
73
+ def test_profile_with_num_rows():
73
74
  with pytest.raises(ValueError):
74
75
  routine(
75
- csv_file_path="tests/a_test_file.csv",
76
+ file_path="tests/data/a_test_file.csv",
76
77
  num_rows=50,
77
78
  output_profile=True,
78
79
  save_results=False,
@@ -85,7 +86,7 @@ def test_exception_different_number_of_columns():
85
86
  """
86
87
  with pytest.raises(ValueError):
87
88
  routine(
88
- csv_file_path="tests/c_test_file.csv",
89
+ file_path="tests/data/c_test_file.csv",
89
90
  num_rows=-1,
90
91
  output_profile=True,
91
92
  save_results=False,
@@ -94,7 +95,7 @@ def test_exception_different_number_of_columns():
94
95
 
95
96
  def test_code_dep_reg_on_file():
96
97
  output = routine(
97
- csv_file_path="tests/b_test_file.csv",
98
+ file_path="tests/data/b_test_file.csv",
98
99
  num_rows=-1,
99
100
  output_profile=False,
100
101
  save_results=False,
@@ -106,7 +107,7 @@ def test_code_dep_reg_on_file():
106
107
 
107
108
  def test_schema_on_file():
108
109
  output = routine(
109
- csv_file_path="tests/b_test_file.csv",
110
+ file_path="tests/data/b_test_file.csv",
110
111
  num_rows=-1,
111
112
  output_schema=True,
112
113
  save_results=False,
@@ -131,52 +132,37 @@ def test_schema_on_file():
131
132
  assert is_column_reg
132
133
 
133
134
 
134
- def test_non_csv_files():
135
- _ = routine(
136
- csv_file_path="tests/file.ods",
137
- num_rows=-1,
138
- output_profile=False,
139
- save_results=False,
140
- )
141
- assert _['engine'] == 'odf'
142
-
135
+ params_csv = [
136
+ ("csv_file", {"engine": None, "sheet_name": None}),
137
+ ("file.csv.gz", {"engine": None, "sheet_name": None, "separator": ",", "columns.len": 3}),
138
+ ]
139
+ params_others = [
140
+ ("file.ods", {"engine": "odf"}),
143
141
  # this is a "tricked" xls file that is actually read as odf
144
- _ = routine(
145
- csv_file_path="tests/file.xls",
146
- num_rows=-1,
147
- output_profile=False,
148
- save_results=False,
149
- )
150
- assert _['engine'] == 'odf'
142
+ ("file.xls", {"engine": "odf"}),
143
+ # this file has an empty first row; check if the sheet we consider is the largest
144
+ ("file.xlsx", {"engine": "openpyxl", "header_row_idx": 1, "sheet_name": "REI_1987"}),
145
+ ("xlsx_file", {"engine": "openpyxl"}),
146
+ ]
151
147
 
152
- _ = routine(
153
- csv_file_path="tests/file.xlsx",
154
- num_rows=-1,
155
- output_profile=False,
156
- save_results=False,
157
- )
158
- assert _['engine'] == 'openpyxl'
159
- # this file has an empty first row
160
- assert _['header_row_idx'] == 1
161
- # check if the sheet we consider is the largest
162
- assert _['sheet_name'] == 'REI_1987'
163
-
164
- _ = routine(
165
- csv_file_path="tests/csv_file",
166
- num_rows=-1,
167
- output_profile=False,
168
- save_results=False,
169
- )
170
- assert not _.get('engine')
171
- assert not _.get('sheet_name')
172
148
 
149
+ @pytest.mark.parametrize("params", params_csv + params_others)
150
+ def test_non_csv_files(params):
151
+ file_name, checks = params
173
152
  _ = routine(
174
- csv_file_path="tests/xlsx_file",
153
+ file_path=f"tests/data/{file_name}",
175
154
  num_rows=-1,
176
155
  output_profile=False,
177
156
  save_results=False,
178
157
  )
179
- assert _['engine'] == 'openpyxl'
158
+ for k, v in checks.items():
159
+ if v is None:
160
+ assert not _.get(k)
161
+ elif "." in k:
162
+ key, func = k.split(".")
163
+ assert eval(func)(_[key]) == v
164
+ else:
165
+ assert _[k] == v
180
166
 
181
167
 
182
168
  @pytest.fixture
@@ -185,21 +171,34 @@ def mocked_responses():
185
171
  yield rsps
186
172
 
187
173
 
188
- def test_urls(mocked_responses):
189
- url = 'http://example.com/test.csv'
190
- expected_content = 'id,name,first_name\n1,John,Smith\n2,Jane,Doe\n3,Bob,Johnson'
174
+ @pytest.mark.parametrize(
175
+ "params",
176
+ # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
177
+ # which doesn't support the way we mock the response, TBC
178
+ params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 414})]
179
+ )
180
+ def test_urls(mocked_responses, params):
181
+ file_name, checks = params
182
+ url = f"http://example.com/{file_name}"
191
183
  mocked_responses.get(
192
184
  url,
193
- body=expected_content,
185
+ body=open(f"tests/data/{file_name}", "rb").read(),
194
186
  status=200,
195
187
  )
196
- output = routine(
197
- csv_file_path=url,
188
+ _ = routine(
189
+ file_path=url,
198
190
  num_rows=-1,
199
191
  output_profile=False,
200
192
  save_results=False,
201
193
  )
202
- assert output['header'] == ["id", "name", "first_name"]
194
+ for k, v in checks.items():
195
+ if v is None:
196
+ assert not _.get(k)
197
+ elif "." in k:
198
+ key, func = k.split(".")
199
+ assert eval(func)(_[key]) == v
200
+ else:
201
+ assert _[k] == v
203
202
 
204
203
 
205
204
  @pytest.mark.parametrize(
@@ -213,7 +212,7 @@ def test_nan_values(expected_type):
213
212
  # if skipping NaN, the column contains only ints
214
213
  skipna, expected_type = expected_type
215
214
  output = routine(
216
- csv_file_path="tests/b_test_file.csv",
215
+ file_path="tests/data/b_test_file.csv",
217
216
  num_rows=-1,
218
217
  save_results=False,
219
218
  skipna=skipna,
@@ -223,7 +222,7 @@ def test_nan_values(expected_type):
223
222
 
224
223
  def test_output_df():
225
224
  output, df = routine(
226
- csv_file_path="tests/b_test_file.csv",
225
+ file_path="tests/data/b_test_file.csv",
227
226
  num_rows=-1,
228
227
  output_profile=False,
229
228
  save_results=False,
@@ -251,7 +250,7 @@ def test_cast_json(mocked_responses, cast_json):
251
250
  status=200,
252
251
  )
253
252
  analysis, df = routine(
254
- csv_file_path='http://example.com/test.csv',
253
+ file_path='http://example.com/test.csv',
255
254
  num_rows=-1,
256
255
  output_profile=False,
257
256
  save_results=False,