csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.12674__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -2
  6. csv_detective/detection/headers.py +14 -12
  7. csv_detective/detection/rows.py +1 -1
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/explore_csv.py +4 -15
  11. csv_detective/format.py +1 -1
  12. csv_detective/formats/__init__.py +0 -0
  13. csv_detective/formats/adresse.py +0 -0
  14. csv_detective/formats/binary.py +0 -0
  15. csv_detective/formats/booleen.py +0 -0
  16. csv_detective/formats/code_commune_insee.py +0 -0
  17. csv_detective/formats/code_csp_insee.py +0 -0
  18. csv_detective/formats/code_departement.py +0 -0
  19. csv_detective/formats/code_fantoir.py +0 -0
  20. csv_detective/formats/code_import.py +0 -0
  21. csv_detective/formats/code_postal.py +0 -0
  22. csv_detective/formats/code_region.py +0 -0
  23. csv_detective/formats/code_rna.py +0 -0
  24. csv_detective/formats/code_waldec.py +0 -0
  25. csv_detective/formats/commune.py +0 -0
  26. csv_detective/formats/csp_insee.py +0 -0
  27. csv_detective/formats/date.py +1 -10
  28. csv_detective/formats/date_fr.py +0 -0
  29. csv_detective/formats/datetime_aware.py +0 -0
  30. csv_detective/formats/datetime_naive.py +0 -0
  31. csv_detective/formats/datetime_rfc822.py +0 -0
  32. csv_detective/formats/departement.py +0 -0
  33. csv_detective/formats/email.py +0 -0
  34. csv_detective/formats/float.py +0 -0
  35. csv_detective/formats/geojson.py +0 -0
  36. csv_detective/formats/insee_ape700.py +0 -0
  37. csv_detective/formats/insee_canton.py +0 -0
  38. csv_detective/formats/int.py +0 -0
  39. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  40. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  41. csv_detective/formats/iso_country_code_numeric.py +0 -0
  42. csv_detective/formats/jour_de_la_semaine.py +0 -0
  43. csv_detective/formats/json.py +0 -0
  44. csv_detective/formats/latitude_l93.py +0 -0
  45. csv_detective/formats/latitude_wgs.py +0 -0
  46. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  47. csv_detective/formats/latlon_wgs.py +0 -0
  48. csv_detective/formats/longitude_l93.py +0 -0
  49. csv_detective/formats/longitude_wgs.py +0 -0
  50. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  51. csv_detective/formats/lonlat_wgs.py +0 -0
  52. csv_detective/formats/mois_de_lannee.py +0 -0
  53. csv_detective/formats/money.py +0 -0
  54. csv_detective/formats/mongo_object_id.py +0 -0
  55. csv_detective/formats/pays.py +0 -0
  56. csv_detective/formats/percent.py +0 -0
  57. csv_detective/formats/region.py +0 -0
  58. csv_detective/formats/sexe.py +0 -0
  59. csv_detective/formats/siren.py +0 -0
  60. csv_detective/formats/siret.py +0 -0
  61. csv_detective/formats/tel_fr.py +0 -0
  62. csv_detective/formats/uai.py +0 -0
  63. csv_detective/formats/url.py +0 -0
  64. csv_detective/formats/username.py +0 -0
  65. csv_detective/formats/uuid.py +0 -0
  66. csv_detective/formats/year.py +0 -0
  67. csv_detective/output/__init__.py +0 -0
  68. csv_detective/output/dataframe.py +2 -2
  69. csv_detective/output/example.py +0 -0
  70. csv_detective/output/profile.py +1 -1
  71. csv_detective/output/schema.py +0 -0
  72. csv_detective/output/utils.py +0 -0
  73. csv_detective/parsing/__init__.py +0 -0
  74. csv_detective/parsing/columns.py +5 -9
  75. csv_detective/parsing/compression.py +0 -0
  76. csv_detective/parsing/csv.py +0 -0
  77. csv_detective/parsing/excel.py +1 -1
  78. csv_detective/parsing/load.py +12 -11
  79. csv_detective/validate.py +36 -71
  80. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +18 -15
  81. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/RECORD +22 -41
  82. csv_detective-0.10.12674.dist-info/WHEEL +4 -0
  83. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +1 -0
  84. csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
  85. csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
  86. csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
  87. tests/__init__.py +0 -0
  88. tests/data/a_test_file.csv +0 -407
  89. tests/data/a_test_file.json +0 -394
  90. tests/data/b_test_file.csv +0 -7
  91. tests/data/c_test_file.csv +0 -2
  92. tests/data/csv_file +0 -7
  93. tests/data/file.csv.gz +0 -0
  94. tests/data/file.ods +0 -0
  95. tests/data/file.xls +0 -0
  96. tests/data/file.xlsx +0 -0
  97. tests/data/xlsx_file +0 -0
  98. tests/test_example.py +0 -67
  99. tests/test_fields.py +0 -175
  100. tests/test_file.py +0 -469
  101. tests/test_labels.py +0 -26
  102. tests/test_structure.py +0 -45
  103. tests/test_validation.py +0 -163
csv_detective/detection/formats.py CHANGED
@@ -11,7 +11,6 @@ from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
-    handle_empty_columns,
     test_col,
     test_col_chunks,
     test_label,
@@ -50,7 +49,6 @@ def detect_formats(
         skipna=skipna,
         verbose=verbose,
     )
-    handle_empty_columns(scores_table_fields)
     res_categorical, _ = detect_categorical_variable(
         table,
         max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
csv_detective/detection/headers.py CHANGED
@@ -5,22 +5,24 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_header_position(file: TextIO, verbose: bool = False) -> int:
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting header position")
+        logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        next_row = file.readline()
-        file.seek(position)
-        if row != next_row:
-            if verbose:
-                display_logs_depending_process_time(
-                    f"Detected header position in {round(time() - start, 3)}s",
-                    time() - start,
-                )
-            return i
-    raise ValueError("Could not accurately retrieve headers position")
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
+            next_row = file.readline()
+            file.seek(position)
+            if row != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f"Detected headers in {round(time() - start, 3)}s",
+                        time() - start,
+                    )
+                return i, headers
+    raise ValueError("Could not retrieve headers")
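
detect_headers now returns the header contents along with the row index, and takes the separator so it can split the candidate row itself; rows identical to their successor are skipped, and if no row qualifies within the first 10 the function raises. A minimal usage sketch against the code above (a real call goes through load_file; the outputs follow from the logic shown):

    from io import StringIO

    from csv_detective.detection.headers import detect_headers

    # a well-formed file: the header is the first row and differs from the next
    f = StringIO("col_a;col_b\n1;2\n3;4\n")
    row_idx, headers = detect_headers(f, sep=";")
    # row_idx == 0, headers == ["col_a", "col_b"]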
csv_detective/detection/rows.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 
 
 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to detect_header_position for csv files, determines how many rows to skip
+    """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):
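
The "Unnamed:" check relies on how pandas names blank header fields: when the first row is empty, the parser autogenerates "Unnamed: N" for every column, which is the signal that the real header sits lower. A self-contained illustration:

    from io import StringIO

    import pandas as pd

    # a blank first row makes pandas autogenerate the column names
    table = pd.read_csv(StringIO(",,\ncol_a,col_b,col_c\n1,2,3\n"))
    print(list(table.columns))  # ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2']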
csv_detective/explore_csv.py CHANGED
@@ -142,19 +142,20 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")
 
-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if not is_valid:
-        # if loading failed in validate, we load it from scratch and initiate an analysis
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
+    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
@@ -164,18 +165,6 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
-    else:
-        # successful validation means we have a correct analysis and col_values
-        # only need to reload the table, and we already know how
-        table, _ = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            verbose=verbose,
-            sep=analysis.get("separator"),
-            encoding=analysis.get("encoding"),
-            engine=analysis.get("engine"),
-            sheet_name=analysis.get("sheet_name"),
-        )
     try:
         return generate_output(
             table=table,
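
Net effect of the two hunks above: validate now returns the table it loaded, so a successful validation no longer triggers a second load_file round-trip, and a failed load is handled separately from a failed validation. A condensed, standalone restatement of the new branching (stub values, not the real API):

    def flow(is_valid, table, analysis, col_values):
        # mirrors the new validate_then_detect branching
        if analysis is None:
            # loading failed inside validate: load from scratch
            table, analysis = "table from load_file", {}
        if not is_valid:
            # validation failed: rerun full format detection on the table
            analysis, col_values = {"detected": True}, {}
        return table, analysis, col_values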
csv_detective/format.py CHANGED
@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable[[Any], bool] = func
+        self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion
csv_detective/formats/date.py CHANGED
@@ -57,9 +57,7 @@ string_month_pattern = (
 
 
 def _is(val):
-    # many early stops, to cut processing time
-    # and avoid the costly use of date_casting as much as possible
-    # /!\ timestamps are considered ints, not dates
+    # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
     # if it's a usual date pattern
@@ -72,13 +70,8 @@ def _is(val):
         ]
     ):
         return True
-    if re.match(r"^-?\d+[\.|,]\d+$", val):
-        # regular floats are excluded
-        return False
-    # not enough digits => not a date (slightly arbitrary)
     if sum([char.isdigit() for char in val]) / len(val) < threshold:
         return False
-    # last resort
     res = date_casting(val)
     if not res or res.hour or res.minute or res.second:
         return False
@@ -93,7 +86,6 @@ _test_values = {
         "15 décembre 1985",
         "02 05 2003",
         "20030502",
-        "2003.05.02",
         "1993-12/02",
     ],
     False: [
@@ -104,6 +96,5 @@ _test_values = {
         "12152003",
         "20031512",
         "02052003",
-        "6.27367393749392839",
     ],
 }
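
With the float-excluding regex gone, values like "2003.05.02" and "6.27367393749392839" leave _test_values too, since their classification would now depend on date_casting rather than on an early stop. The _test_values mapping (expected result to sample values) lends itself to a quick self-check; a sketch, assuming the module path from the file list above:

    from csv_detective.formats.date import _is, _test_values

    for expected, values in _test_values.items():
        for value in values:
            assert bool(_is(value)) == expected, f"{value!r} misclassified"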
csv_detective/output/dataframe.py CHANGED
@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
 
 
 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
-        # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
        return None
     match _type:
         case "string":
csv_detective/output/profile.py CHANGED
@@ -23,7 +23,7 @@ def create_profile(
     logging.info("Creating profile")
 
     if num_rows > 0:
-        raise ValueError("To create profile `num_rows` must be set to -1")
+        raise ValueError("To create profiles num_rows has to be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
csv_detective/parsing/columns.py CHANGED
@@ -13,13 +13,6 @@ from csv_detective.utils import display_logs_depending_process_time
 MAX_NUMBER_CATEGORICAL_VALUES = 25
 
 
-def handle_empty_columns(return_table: pd.DataFrame):
-    # handling that empty columns score 1 everywhere
-    for col in return_table.columns:
-        if sum(return_table[col]) == len(return_table):
-            return_table[col] = 0
-
-
 def test_col_val(
     serie: pd.Series,
     format: Format,
@@ -40,7 +33,7 @@ def test_col_val(
 
     try:
         if skipna:
-            serie = serie.dropna()
+            serie = serie.loc[serie.notnull()]
         ser_len = len(serie)
         if ser_len == 0:
             # being here means the whole column is NaN, so if skipna it's a pass
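
The dropna to .loc[serie.notnull()] swap is behavior-preserving for a Series: both drop the missing entries and keep the surviving index intact. A quick check:

    import pandas as pd

    serie = pd.Series(["a", None, "b"])
    assert serie.dropna().equals(serie.loc[serie.notnull()])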
@@ -229,7 +222,10 @@ def test_col_chunks(
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
-    handle_empty_columns(return_table)
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
     if verbose:
         display_logs_depending_process_time(
             f"Done testing chunks in {round(time() - start, 3)}s", time() - start
csv_detective/parsing/excel.py CHANGED
@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str | int | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py CHANGED
@@ -1,4 +1,3 @@
-import codecs
 from io import BytesIO, StringIO
 
 import pandas as pd
@@ -11,7 +10,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import detect_header_position
+from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv
@@ -28,12 +27,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
-    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-    if ("." not in file_name or not file_name.endswith("csv")) and engine is None and sep is None:
-        # file has no extension and we don't have insights from arguments, we'll investigate how to read it
+    engine = None
+    if "." not in file_name or not file_name.endswith("csv"):
+        # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)
 
     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
@@ -46,6 +45,9 @@ def load_file(
         )
     if table.empty:
         raise ValueError("Table seems to be empty")
+    header = table.columns.to_list()
+    if any(col.startswith("Unnamed") for col in header):
+        raise ValueError("Could not retrieve headers")
     analysis = {
         "engine": engine,
         "sheet_name": sheet_name,
@@ -67,20 +69,21 @@ def load_file(
         binary_file.seek(0)
     # decoding and reading file
     if is_url(file_path) or engine in COMPRESSION_ENGINES:
-        decoder = codecs.getincrementaldecoder(encoding)()
         str_file = StringIO()
         while True:
             chunk = binary_file.read(1024**2)
             if not chunk:
                 break
-            str_file.write(decoder.decode(chunk))
+            str_file.write(chunk.decode(encoding=encoding))
         del binary_file
         str_file.seek(0)
     else:
         str_file = open(file_path, "r", encoding=encoding)
     if sep is None:
         sep = detect_separator(str_file, verbose=verbose)
-    header_row_idx = detect_header_position(str_file, verbose=verbose)
+    header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+    if header is None or (isinstance(header, list) and any([h is None for h in header])):
+        raise ValueError("Could not retrieve headers")
     heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
     trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
     table, total_lines, nb_duplicates = parse_csv(
@@ -97,11 +100,9 @@ def load_file(
     }
     if engine is not None:
         analysis["compression"] = engine
-    if any(not isinstance(col, str) or col.startswith("Unnamed:") for col in table.columns):
-        raise ValueError("Could not accurately detect the file's columns")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header": list(table.columns),
+        "header": header,
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines