csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. csv_detective/detection/formats.py +12 -15
  2. csv_detective/explore_csv.py +28 -9
  3. csv_detective/format.py +67 -0
  4. csv_detective/formats/__init__.py +9 -0
  5. csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
  6. csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
  7. csv_detective/formats/code_commune_insee.py +26 -0
  8. csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
  9. csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
  10. csv_detective/formats/code_fantoir.py +21 -0
  11. csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
  12. csv_detective/formats/code_postal.py +25 -0
  13. csv_detective/formats/code_region.py +22 -0
  14. csv_detective/formats/code_rna.py +29 -0
  15. csv_detective/formats/code_waldec.py +17 -0
  16. csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
  17. csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
  18. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  19. csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
  20. csv_detective/formats/date_fr.py +22 -0
  21. csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
  22. csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
  23. csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
  24. csv_detective/formats/departement.py +37 -0
  25. csv_detective/formats/email.py +28 -0
  26. csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
  27. csv_detective/formats/geojson.py +36 -0
  28. csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
  29. csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
  30. csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
  31. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  32. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  33. csv_detective/formats/iso_country_code_numeric.py +31 -0
  34. csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
  35. csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
  36. csv_detective/formats/latitude_l93.py +48 -0
  37. csv_detective/formats/latitude_wgs.py +42 -0
  38. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  39. csv_detective/formats/latlon_wgs.py +53 -0
  40. csv_detective/formats/longitude_l93.py +39 -0
  41. csv_detective/formats/longitude_wgs.py +32 -0
  42. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  43. csv_detective/formats/lonlat_wgs.py +36 -0
  44. csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
  45. csv_detective/formats/money.py +18 -0
  46. csv_detective/formats/mongo_object_id.py +14 -0
  47. csv_detective/formats/pays.py +35 -0
  48. csv_detective/formats/percent.py +16 -0
  49. csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
  50. csv_detective/formats/sexe.py +17 -0
  51. csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
  52. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
  53. csv_detective/formats/tel_fr.py +36 -0
  54. csv_detective/formats/uai.py +36 -0
  55. csv_detective/formats/url.py +45 -0
  56. csv_detective/formats/username.py +14 -0
  57. csv_detective/formats/uuid.py +16 -0
  58. csv_detective/formats/year.py +28 -0
  59. csv_detective/output/__init__.py +3 -4
  60. csv_detective/output/dataframe.py +3 -3
  61. csv_detective/output/profile.py +2 -3
  62. csv_detective/output/schema.py +2 -2
  63. csv_detective/parsing/columns.py +35 -50
  64. csv_detective/parsing/csv.py +2 -2
  65. csv_detective/parsing/load.py +4 -5
  66. csv_detective/validate.py +9 -4
  67. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/METADATA +6 -5
  68. csv_detective-0.9.3.dev2348.dist-info/RECORD +102 -0
  69. tests/test_fields.py +39 -364
  70. tests/test_file.py +1 -1
  71. tests/test_labels.py +5 -3
  72. tests/test_structure.py +40 -36
  73. csv_detective/detect_fields/FR/__init__.py +0 -0
  74. csv_detective/detect_fields/FR/geo/__init__.py +0 -0
  75. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
  76. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
  77. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
  78. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
  79. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
  80. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
  81. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  82. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
  83. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  84. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
  85. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  86. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  87. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
  88. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
  89. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  90. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
  91. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
  92. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  93. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  94. csv_detective/detect_fields/__init__.py +0 -112
  95. csv_detective/detect_fields/geo/__init__.py +0 -0
  96. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  97. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  98. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  99. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
  100. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  101. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
  102. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  103. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
  104. csv_detective/detect_fields/other/__init__.py +0 -0
  105. csv_detective/detect_fields/other/email/__init__.py +0 -10
  106. csv_detective/detect_fields/other/money/__init__.py +0 -11
  107. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  108. csv_detective/detect_fields/other/percent/__init__.py +0 -9
  109. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  110. csv_detective/detect_fields/other/url/__init__.py +0 -14
  111. csv_detective/detect_fields/other/uuid/__init__.py +0 -10
  112. csv_detective/detect_fields/temp/__init__.py +0 -0
  113. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  114. csv_detective/detect_labels/FR/__init__.py +0 -0
  115. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  116. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
  117. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
  118. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
  119. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
  120. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
  121. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
  122. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
  123. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
  124. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
  125. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
  126. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
  127. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
  128. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
  129. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
  130. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
  131. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  132. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
  133. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
  134. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
  135. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
  136. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
  137. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
  138. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
  139. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
  140. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
  141. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
  142. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
  143. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  144. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
  145. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
  146. csv_detective/detect_labels/__init__.py +0 -94
  147. csv_detective/detect_labels/geo/__init__.py +0 -0
  148. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
  149. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
  150. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
  151. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
  152. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
  153. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
  154. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
  155. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
  156. csv_detective/detect_labels/other/__init__.py +0 -0
  157. csv_detective/detect_labels/other/booleen/__init__.py +0 -8
  158. csv_detective/detect_labels/other/email/__init__.py +0 -20
  159. csv_detective/detect_labels/other/float/__init__.py +0 -8
  160. csv_detective/detect_labels/other/int/__init__.py +0 -8
  161. csv_detective/detect_labels/other/money/__init__.py +0 -8
  162. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
  163. csv_detective/detect_labels/other/twitter/__init__.py +0 -8
  164. csv_detective/detect_labels/other/url/__init__.py +0 -23
  165. csv_detective/detect_labels/other/uuid/__init__.py +0 -8
  166. csv_detective/detect_labels/temp/__init__.py +0 -0
  167. csv_detective/detect_labels/temp/date/__init__.py +0 -28
  168. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
  169. csv_detective/detect_labels/temp/year/__init__.py +0 -19
  170. csv_detective/load_tests.py +0 -59
  171. csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
  172. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  173. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  174. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  175. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
  176. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/WHEEL +0 -0
  177. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/entry_points.txt +0 -0
  178. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/licenses/LICENSE +0 -0
  179. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/top_level.txt +0 -0
csv_detective/detection/formats.py
@@ -7,7 +7,7 @@ from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
-from csv_detective.load_tests import return_all_tests
+from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
@@ -16,12 +16,14 @@ from csv_detective.parsing.columns import (
     test_label,
 )
 
+fmtm = FormatsManager()
+
 
 def detect_formats(
     table: pd.DataFrame,
     analysis: dict,
     file_path: str,
-    user_input_tests: str | list[str] = "ALL",
+    tags: list[str] | None = None,
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
@@ -29,15 +31,12 @@ def detect_formats(
     in_chunks = analysis.get("total_lines") is None
 
     # list testing to be performed
-    all_tests_fields = return_all_tests(
-        user_input_tests, detect_type="detect_fields"
-    )  # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
+    formats: dict[str, Format] = (
+        fmtm.get_formats_from_tags(tags) if tags is not None else fmtm.formats
+    )
 
     # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
+    if len(formats) == 0:
         return analysis, None
 
     # Perform testing on fields
@@ -45,7 +44,7 @@ def detect_formats(
         # table is small enough to be tested in one go
         scores_table_fields = test_col(
             table=table,
-            all_tests=all_tests_fields,
+            formats=formats,
            limited_output=limited_output,
            skipna=skipna,
            verbose=verbose,
@@ -62,7 +61,7 @@ def detect_formats(
            table=table,
            file_path=file_path,
            analysis=analysis,
-            all_tests=all_tests_fields,
+            formats=formats,
            limited_output=limited_output,
            skipna=skipna,
            verbose=verbose,
@@ -70,9 +69,7 @@ def detect_formats(
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-    scores_table_labels = test_label(
-        analysis["header"], all_tests_labels, limited_output, verbose=verbose
-    )
+    scores_table_labels = test_label(analysis["header"], formats, limited_output, verbose=verbose)
     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
@@ -115,7 +112,7 @@ def detect_formats(
         "float": "float",
         "string": "string",
         "json": "json",
-        "json_geojson": "json",
+        "geojson": "json",
         "datetime_aware": "datetime",
         "datetime_naive": "datetime",
         "datetime_rfc822": "datetime",
csv_detective/explore_csv.py
@@ -15,7 +15,7 @@ logging.basicConfig(level=logging.INFO)
 def routine(
     file_path: str,
     num_rows: int = 500,
-    user_input_tests: str | list[str] = "ALL",
+    tags: list[str] | None = None,
     limited_output: bool = True,
     save_results: bool | str = True,
     encoding: str | None = None,
@@ -28,14 +28,13 @@ def routine(
     verbose: bool = False,
     sheet_name: str | int | None = None,
 ) -> dict | tuple[dict, pd.DataFrame]:
-    """Returns a dict with information about the table and possible
-    column contents, and if requested the DataFrame with columns cast according to analysis.
+    """
+    Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
         file_path: local path or URL to file
-        num_rows: number of rows to sample from the file for analysis ; -1 for analysis
-        of the whole file
-        user_input_tests: tests to run on the file
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
         limited_output: whether or not to return all possible types or only the most likely one for each column
         save_results: whether or not to save the results in a json file, or the path where to dump the output
         output_profile: whether or not to add the 'profile' field to the output
@@ -74,7 +73,7 @@ def routine(
         table=table,
         analysis=analysis,
         file_path=file_path,
-        user_input_tests=user_input_tests,
+        tags=tags,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
@@ -107,7 +106,7 @@ def validate_then_detect(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    user_input_tests: str | list[str] = "ALL",
+    tags: list[str] | None = None,
     limited_output: bool = True,
     save_results: bool | str = True,
     skipna: bool = True,
@@ -117,6 +116,26 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
+    """
+    Performs a validation of the given file against the given analysis.
+    If the validation fails, performs a full analysis and return it.
+    Otherwise return the previous analysis (which is therefore still valid).
+    NB: if asked, the profile is recreated in both cases.
+
+    Args:
+        file_path: the path of the file to validate.
+        previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
+        limited_output: whether or not to return all possible types or only the most likely one for each column
+        save_results: whether or not to save the results in a json file, or the path where to dump the output
+        skipna: whether to ignore NaN values in the checks
+        output_profile: whether or not to add the 'profile' field to the output
+        output_schema: whether or not to add the 'schema' field to the output (tableschema)
+        output_df: whether or not to return the loaded DataFrame along with the analysis report
+        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+        verbose: whether the code displays the steps it's going through
+    """
     if verbose:
         start_routine = time()
     if is_url(file_path):
@@ -140,7 +159,7 @@ def validate_then_detect(
         table=table,
         analysis=analysis,
         file_path=file_path,
-        user_input_tests=user_input_tests,
+        tags=tags,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
csv_detective/format.py
@@ -0,0 +1,67 @@
+from typing import Any, Callable
+
+from csv_detective.parsing.text import header_score
+
+
+class Format:
+    def __init__(
+        self,
+        name: str,
+        func: Callable[[Any], bool],
+        _test_values: dict[bool, list[str]],
+        labels: list[str] = [],
+        proportion: float = 1,
+        tags: list[str] = [],
+    ) -> None:
+        """
+        Instanciates a Format object.
+
+        Args:
+            name: the name of the format.
+            func: the value test for the format (returns whether a string is valid).
+            _test_values: lists of valid and invalid values, used in the tests
+            labels: the list of hint headers for the header score
+            proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
+            tags: to allow users to submit a file to only a subset of formats
+        """
+        self.name: str = name
+        self.func: Callable = func
+        self._test_values: dict[bool, list[str]] = _test_values
+        self.labels: list[str] = labels
+        self.proportion: float = proportion
+        self.tags: list[str] = tags
+
+    def is_valid_label(self, val: str) -> float:
+        return header_score(val, self.labels)
+
+
+class FormatsManager:
+    formats: dict[str, Format]
+
+    def __init__(self) -> None:
+        import csv_detective.formats as formats
+
+        format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))]
+        self.formats = {
+            label: Format(
+                name=label,
+                func=(module := getattr(formats, label))._is,
+                _test_values=module._test_values,
+                **{
+                    attr: val
+                    for attr in ["labels", "proportion", "tags"]
+                    if (val := getattr(module, attr, None))
+                },
+            )
+            for label in format_labels
+        }
+
+    def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]:
+        return {
+            label: fmt
+            for label, fmt in self.formats.items()
+            if all(tag in fmt.tags for tag in tags)
+        }
+
+    def available_tags(self) -> set[str]:
+        return set(tag for format in self.formats.values() for tag in format.tags)
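As a rough illustration of how the FormatsManager added above is meant to be queried (tag names taken from the format modules shown later in this diff):

from csv_detective.format import FormatsManager

fmtm = FormatsManager()

# Every tag declared by at least one format module (e.g. "fr", "geo", "type").
print(fmtm.available_tags())

# get_formats_from_tags keeps only the formats carrying *all* requested tags,
# so ["fr", "geo"] selects French geographic formats such as code_departement.
for name, fmt in fmtm.get_formats_from_tags(["fr", "geo"]).items():
    # fmt.func is the module's _is check; fmt.proportion is the validity threshold.
    print(name, fmt.proportion, fmt.func("75"))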
csv_detective/formats/__init__.py
@@ -0,0 +1,9 @@
+import importlib
+import os
+
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        module_name = file[:-3]
+        module = importlib.import_module(f"csv_detective.formats.{module_name}")
+        globals()[module_name] = module
+        del module
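Given this loader and the conventions visible in the format modules below, a new format could presumably be added by dropping a module of this shape into csv_detective/formats/; the module name, regex and samples here are hypothetical, for illustration only.

# csv_detective/formats/code_exemple.py -- hypothetical module, not part of the package
import re

proportion = 1             # share of the column that must pass _is
tags = ["fr"]              # lets users select this format via the tags filter
labels = ["code exemple"]  # header hints used for the label score

_pattern = re.compile(r"^[A-Z]{2}\d{3}$")


def _is(val):
    return isinstance(val, str) and bool(_pattern.match(val))


# valid/invalid samples consumed by the test suite
_test_values = {
    True: ["AB123"],
    False: ["ab123", "AB12"],
}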
csv_detective/detect_fields/FR/geo/adresse/__init__.py → csv_detective/formats/adresse.py
@@ -1,100 +1,116 @@
-from csv_detective.parsing.text import _process_text
-
-PROPORTION = 0.55
-# ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long
-voies = {
-    "aire ",
-    "allee ",
-    "avenue ",
-    "base ",
-    "boulevard ",
-    "cami ",
-    "carrefour ",
-    "chemin ",
-    "cheminement ",
-    "chaussee ",
-    "cite ",
-    "clos ",
-    "coin ",
-    "corniche ",
-    "cote ",
-    "cour ",
-    "cours ",
-    "domaine ",
-    "descente ",
-    "ecart ",
-    "esplanade ",
-    "faubourg ",
-    "gare ",
-    "grande rue",
-    "hameau ",
-    "halle ",
-    "ilot ",
-    "impasse ",
-    "lieu dit",
-    "lotissement ",
-    "marche ",
-    "montee ",
-    "parc ",
-    "passage ",
-    "place ",
-    "plan ",
-    "plaine ",
-    "plateau ",
-    "pont ",
-    "port ",
-    "promenade ",
-    "parvis ",
-    "quartier ",
-    "quai ",
-    "residence ",
-    "ruelle ",
-    "rocade ",
-    "rond point",
-    "route ",
-    "rue ",
-    # 'sente - sentier',
-    "square ",
-    "tour ",
-    # 'terre-plein',
-    "traverse ",
-    "villa ",
-    "village ",
-    "voie ",
-    "zone artisanale",
-    "zone d’amenagement concerte",
-    "zone d’amenagement differe",
-    "zone industrielle",
-    "zone ",
-    # 'r',
-    "av ",
-    "pl ",
-    "bd ",
-    "cami ",
-    # 'che',
-    "chs ",
-    "dom ",
-    "ham ",
-    "ld ",
-    # 'pro',
-    # 'rte',
-    "vlge ",
-    "za ",
-    "zac ",
-    "zad ",
-    "zi ",
-    # 'car',
-    "fg ",
-    # 'lot',
-    "imp ",
-    # 'qu',
-    "mte",
-}
-
-
-def _is(val):
-    """Repere des adresses"""
-    if not isinstance(val, str) or len(val) > 150:
-        return False
-    val = _process_text(val)
-    return any(x in val for x in voies)
+from csv_detective.parsing.text import _process_text
+
+proportion = 0.55
+tags = ["fr", "geo"]
+labels = [
+    "adresse",
+    "localisation",
+    "adresse postale",
+    "adresse geographique",
+    "adr",
+    "adresse complete",
+    "adresse station",
+]
+
+voies = {
+    "aire ",
+    "allee ",
+    "avenue ",
+    "base ",
+    "boulevard ",
+    "cami ",
+    "carrefour ",
+    "chemin ",
+    "cheminement ",
+    "chaussee ",
+    "cite ",
+    "clos ",
+    "coin ",
+    "corniche ",
+    "cote ",
+    "cour ",
+    "cours ",
+    "domaine ",
+    "descente ",
+    "ecart ",
+    "esplanade ",
+    "faubourg ",
+    "gare ",
+    "grande rue",
+    "hameau ",
+    "halle ",
+    "ilot ",
+    "impasse ",
+    "lieu dit",
+    "lotissement ",
+    "marche ",
+    "montee ",
+    "parc ",
+    "passage ",
+    "place ",
+    "plan ",
+    "plaine ",
+    "plateau ",
+    "pont ",
+    "port ",
+    "promenade ",
+    "parvis ",
+    "quartier ",
+    "quai ",
+    "residence ",
+    "ruelle ",
+    "rocade ",
+    "rond point",
+    "route ",
+    "rue ",
+    # 'sente - sentier',
+    "square ",
+    "tour ",
+    # 'terre-plein',
+    "traverse ",
+    "villa ",
+    "village ",
+    "voie ",
+    "zone artisanale",
+    "zone d’amenagement concerte",
+    "zone d’amenagement differe",
+    "zone industrielle",
+    "zone ",
+    # 'r',
+    "av ",
+    "pl ",
+    "bd ",
+    "cami ",
+    # 'che',
+    "chs ",
+    "dom ",
+    "ham ",
+    "ld ",
+    # 'pro',
+    # 'rte',
+    "vlge ",
+    "za ",
+    "zac ",
+    "zad ",
+    "zi ",
+    # 'car',
+    "fg ",
+    # 'lot',
+    "imp ",
+    # 'qu',
+    "mte",
+}
+
+
+def _is(val):
+    """Repere des adresses"""
+    if not isinstance(val, str) or len(val) > 150:
+        return False
+    val = _process_text(val)
+    return any(x in val for x in voies)
+
+
+_test_values = {
+    True: ["rue du martyr"],
+    False: ["un batiment"],
+}
csv_detective/detect_fields/other/booleen/__init__.py → csv_detective/formats/booleen.py
@@ -1,27 +1,35 @@
-PROPORTION = 1
-bool_mapping = {
-    "1": True,
-    "0": False,
-    "vrai": True,
-    "faux": False,
-    "true": True,
-    "false": False,
-    "oui": True,
-    "non": False,
-    "yes": True,
-    "no": False,
-    "y": True,
-    "n": False,
-    "o": True,
-}
-
-liste_bool = set(bool_mapping.keys())
-
-
-def bool_casting(val: str) -> bool:
-    return bool_mapping.get(val.lower())
-
-
-def _is(val: str) -> bool:
-    """Détecte les booléens"""
-    return isinstance(val, str) and val.lower() in liste_bool
+proportion = 1
+tags = ["type"]
+labels = ["is ", "has ", "est "]
+
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
+}
+
+liste_bool = set(bool_mapping.keys())
+
+
+def bool_casting(val: str) -> bool:
+    return bool_mapping.get(val.lower())
+
+
+def _is(val):
+    return isinstance(val, str) and val.lower() in liste_bool
+
+
+_test_values = {
+    True: ["oui", "0", "1", "yes", "false", "True"],
+    False: ["nein", "ja", "2", "-0"],
+}
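The _test_values dictionaries that every format module now carries could plausibly be exercised by a single parametrized test along these lines; this is only a sketch, the real tests/test_fields.py may be organized differently.

import pytest

from csv_detective.format import FormatsManager

_formats = FormatsManager().formats


@pytest.mark.parametrize("name", sorted(_formats))
def test_format_test_values(name):
    fmt = _formats[name]
    for expected, values in fmt._test_values.items():
        for value in values:
            # every listed sample must match its declared validity
            assert bool(fmt.func(value)) is expected, f"{name}: {value!r}"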
csv_detective/formats/code_commune_insee.py
@@ -0,0 +1,26 @@
+from frformat import CodeCommuneInsee, Millesime
+
+proportion = 0.75
+tags = ["fr", "geo"]
+labels = [
+    "code commune insee",
+    "code insee",
+    "codes insee",
+    "code commune",
+    "code insee commune",
+    "insee",
+    "code com",
+    "com",
+]
+
+_code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
+
+
+def _is(val):
+    return isinstance(val, str) and _code_commune_insee.is_valid(val)
+
+
+_test_values = {
+    True: ["91471", "01053"],
+    False: ["914712", "01000"],
+}
csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py → csv_detective/formats/code_csp_insee.py
@@ -1,29 +1,36 @@
-import re
-
-from csv_detective.parsing.text import _process_text
-
-PROPORTION = 1
-
-
-def _is(val):
-    """Repère les code csp telles que définies par l'INSEE"""
-    if not isinstance(val, str):
-        return False
-    val = _process_text(val)
-    if len(val) != 4:
-        return False
-    a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val))
-    b = val in {
-        "7100",
-        "7200",
-        "7400",
-        "7500",
-        "7700",
-        "7800",
-        "8100",
-        "8300",
-        "8400",
-        "8500",
-        "8600",
-    }
-    return a or b
+import re
+
+from csv_detective.parsing.text import _process_text
+
+proportion = 1
+tags = ["fr"]
+labels = ["code csp insee", "code csp"]
+
+
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    val = _process_text(val)
+    if len(val) != 4:
+        return False
+    a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val))
+    b = val in {
+        "7100",
+        "7200",
+        "7400",
+        "7500",
+        "7700",
+        "7800",
+        "8100",
+        "8300",
+        "8400",
+        "8500",
+        "8600",
+    }
+    return a or b
+
+
+_test_values = {
+    True: ["121f"],
+    False: ["121x"],
+}
csv_detective/detect_fields/FR/geo/code_departement/__init__.py → csv_detective/formats/code_departement.py
@@ -1,15 +1,29 @@
-from frformat import Millesime, NumeroDepartement, Options
-
-PROPORTION = 1
-
-_options = Options(
-    ignore_case=True,
-    ignore_accents=True,
-    replace_non_alphanumeric_with_space=True,
-    ignore_extra_whitespace=True,
-)
-_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
-
-
-def _is(val):
-    return isinstance(val, str) and _numero_departement.is_valid(val)
+from frformat import Millesime, NumeroDepartement, Options
+
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "code departement",
+    "code_departement",
+    "dep",
+    "departement",
+    "dept",
+]
+
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
+
+
+def _is(val):
+    return isinstance(val, str) and _numero_departement.is_valid(val)
+
+
+_test_values = {
+    True: ["75", "2A", "2b", "974", "01"],
+    False: ["00", "96", "101"],
+}