csv-detective 0.8.1.dev1469__py3-none-any.whl → 0.8.1.dev1491__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
csv_detective/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from .explore_csv import routine, routine_minio, validate_then_detect  # noqa
- from .output.example import create_example_csv_file  # noqa
+ from csv_detective.explore_csv import routine, routine_minio, validate_then_detect  # noqa
+ from csv_detective.output.example import create_example_csv_file  # noqa

  __version__ = '0.8.1.dev'
csv_detective/cli.py CHANGED
@@ -4,7 +4,7 @@ Command line client for csv_detective

  import argparse
  import json
- from .explore_csv import routine
+ from csv_detective.explore_csv import routine


  def run():
csv_detective/explore_csv.py CHANGED
@@ -7,12 +7,12 @@ from typing import Optional, Union

  import pandas as pd

- from .detection.formats import detect_formats
- from .output import generate_output, generate_table_schema
- from .parsing.load import load_file
- from .s3_utils import download_from_minio, upload_to_minio
- from .utils import display_logs_depending_process_time, is_url
- from .validate import validate
+ from csv_detective.detection.formats import detect_formats
+ from csv_detective.output import generate_output, generate_table_schema
+ from csv_detective.parsing.load import load_file
+ from csv_detective.s3_utils import download_from_minio, upload_to_minio
+ from csv_detective.utils import display_logs_depending_process_time, is_url
+ from csv_detective.validate import validate

  logging.basicConfig(level=logging.INFO)

csv_detective/parsing/columns.py ADDED
@@ -0,0 +1,141 @@
+ import logging
+ from time import time
+ from typing import Callable
+
+ import pandas as pd
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+ MAX_ROWS_ANALYSIS = 1e5
+
+
+ def test_col_val(
+     serie: pd.Series,
+     test_func: Callable,
+     proportion: float = 0.9,
+     skipna: bool = True,
+     limited_output: bool = False,
+     verbose: bool = False,
+ ):
+     """Tests values of the serie using test_func.
+     - skipna : if True indicates that NaNs are not counted as False
+     - proportion : indicates the proportion of values that have to pass the test
+       for the serie to be detected as a certain format
+     """
+     if verbose:
+         start = time()
+
+     # TODO : change for a cleaner method and only test columns in modules labels
+     def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
+         return serie.sample(n=_range).apply(test_func)
+     try:
+         if skipna:
+             serie = serie[serie.notnull()]
+         ser_len = len(serie)
+         if ser_len == 0:
+             return 0.0
+         if not limited_output:
+             result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
+             return result if result >= proportion else 0.0
+         else:
+             if proportion == 1:  # Then try first 1 value, then 5, then all
+                 for _range in [
+                     min(1, ser_len),
+                     min(5, ser_len),
+                     ser_len,
+                 ]:  # Pour ne pas faire d'opérations inutiles, on commence par 1,
+                     # puis 5 valeurs puis la serie complète
+                     if all(apply_test_func(serie, test_func, _range)):
+                         # print(serie.name, ': check OK')
+                         pass
+                     else:
+                         return 0.0
+                 return 1.0
+             else:
+                 # if we have a proportion, statistically it's OK to analyse up to 10k rows
+                 # (arbitrary number) and get a significant result
+                 to_analyse = min(ser_len, MAX_ROWS_ANALYSIS)
+                 result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
+                 return result if result >= proportion else 0.0
+     finally:
+         if verbose and time() - start > 3:
+             display_logs_depending_process_time(
+                 f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
+                 time() - start
+             )
+
+
+ def test_col_label(label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False):
+     """Tests label (from header) using test_func.
+     - proportion : indicates the minimum score to pass the test for the serie
+       to be detected as a certain format
+     """
+     if not limited_output:
+         return test_func(label)
+     else:
+         result = test_func(label)
+         return result if result >= proportion else 0
+
+
+ def test_col(table: pd.DataFrame, all_tests: list, limited_output: bool, skipna: bool = True, verbose: bool = False):
+     if verbose:
+         start = time()
+         logging.info("Testing columns to get types")
+     test_funcs = dict()
+     for test in all_tests:
+         name = test.__name__.split(".")[-1]
+         test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
+     return_table = pd.DataFrame(columns=table.columns)
+     for idx, (key, value) in enumerate(test_funcs.items()):
+         if verbose:
+             start_type = time()
+             logging.info(f"\t- Starting with type '{key}'")
+         # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
+         # => the following needs to change, "apply" means all columns are tested for one type at once
+         return_table.loc[key] = table.apply(
+             lambda serie: test_col_val(
+                 serie,
+                 value["func"],
+                 value["prop"],
+                 skipna=skipna,
+                 limited_output=limited_output,
+                 verbose=verbose,
+             )
+         )
+         if verbose:
+             display_logs_depending_process_time(
+                 f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
+                 time() - start_type
+             )
+     if verbose:
+         display_logs_depending_process_time(f"Done testing columns in {round(time() - start, 3)}s", time() - start)
+     return return_table
+
+
+ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False):
+     if verbose:
+         start = time()
+         logging.info("Testing labels to get types")
+     test_funcs = dict()
+     for test in all_tests:
+         name = test.__name__.split(".")[-1]
+         test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
+
+     return_table = pd.DataFrame(columns=table.columns)
+     for idx, (key, value) in enumerate(test_funcs.items()):
+         if verbose:
+             start_type = time()
+         return_table.loc[key] = [
+             test_col_label(
+                 col_name, value["func"], value["prop"], limited_output=limited_output
+             )
+             for col_name in table.columns
+         ]
+         if verbose:
+             display_logs_depending_process_time(
+                 f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
+                 time() - start_type
+             )
+     if verbose:
+         display_logs_depending_process_time(f"Done testing labels in {round(time() - start, 3)}s", time() - start)
+     return return_table
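As an aside, here is a minimal sketch of how the new `test_col_val` helper behaves when called on its own; the `is_int` test function and the sample series below are illustrative and not part of the package:

```python
import pandas as pd

from csv_detective.parsing.columns import test_col_val

# hypothetical value test: does the cell parse as an integer?
def is_int(val) -> bool:
    try:
        int(val)
        return True
    except (TypeError, ValueError):
        return False

serie = pd.Series(["1", "2", "x", None, "4"])
# with skipna=True the None is dropped; 3 of the 4 remaining values pass,
# so 0.75 is below the 0.9 threshold and the function returns 0.0
print(test_col_val(serie, is_int, proportion=0.9))
# lowering the threshold lets the column qualify and the score (0.75) is returned
print(test_col_val(serie, is_int, proportion=0.7))
```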
csv_detective/parsing/compression.py ADDED
@@ -0,0 +1,11 @@
+ import gzip
+ from io import BytesIO
+
+
+ def unzip(binary_file: BytesIO, engine: str) -> BytesIO:
+     if engine == "gzip":
+         with gzip.open(binary_file, mode="rb") as binary_file:
+             file_content = binary_file.read()
+     else:
+         raise NotImplementedError(f"{engine} is not yet supported")
+     return BytesIO(file_content)
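A short usage sketch for the new `unzip` helper, assuming an in-memory gzip payload built on the spot (the CSV bytes are illustrative):

```python
import gzip
from io import BytesIO

from csv_detective.parsing.compression import unzip

# build a small gzip-compressed CSV in memory
payload = BytesIO(gzip.compress(b"id;name\n1;alice\n2;bob\n"))

# returns a fresh BytesIO with the decompressed content
decompressed = unzip(binary_file=payload, engine="gzip")
print(decompressed.read().decode("utf-8"))

# any other engine raises NotImplementedError, per the function above
```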
csv_detective/parsing/csv.py ADDED
@@ -0,0 +1,55 @@
+ import logging
+ from time import time
+ from typing import TextIO
+
+ import pandas as pd
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def parse_csv(
+     the_file: TextIO,
+     encoding: str,
+     sep: str,
+     num_rows: int,
+     skiprows: int,
+     random_state: int = 42,
+     verbose: bool = False,
+ ) -> tuple[pd.DataFrame, int, int]:
+     if verbose:
+         start = time()
+         logging.info("Parsing table")
+     table = None
+
+     if not isinstance(the_file, str):
+         the_file.seek(0)
+
+     total_lines = None
+     for encoding in [encoding, "ISO-8859-1", "utf-8"]:
+         if encoding is None:
+             continue
+
+         if "ISO-8859" in encoding:
+             encoding = "ISO-8859-1"
+         try:
+             table = pd.read_csv(
+                 the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
+             )
+             total_lines = len(table)
+             nb_duplicates = len(table.loc[table.duplicated()])
+             if num_rows > 0:
+                 num_rows = min(num_rows - 1, total_lines)
+                 table = table.sample(num_rows, random_state=random_state)
+             # else : table is unchanged
+             break
+         except TypeError:
+             print("Trying encoding : {encoding}".format(encoding=encoding))
+
+     if table is None:
+         raise ValueError("Could not load file")
+     if verbose:
+         display_logs_depending_process_time(
+             f'Table parsed successfully in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return table, total_lines, nb_duplicates
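A sketch of calling the new `parse_csv` directly on an in-memory buffer; the sample CSV, separator and parameters below are illustrative, and in the package the buffer is normally prepared by `parsing.load.load_file`:

```python
from io import StringIO

from csv_detective.parsing.csv import parse_csv

# illustrative in-memory CSV with a deliberate duplicated row
raw = StringIO("id;label\n1;foo\n2;bar\n2;bar\n")

table, total_lines, nb_duplicates = parse_csv(
    raw,
    encoding="utf-8",
    sep=";",
    num_rows=-1,   # non-positive: keep every row instead of sampling
    skiprows=0,    # header is on the first line
)
print(total_lines, nb_duplicates)  # 3 data rows read, 1 duplicated row
```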
csv_detective/parsing/excel.py ADDED
@@ -0,0 +1,169 @@
+ from io import BytesIO
+ from time import time
+ from typing import Optional
+
+ import openpyxl
+ import pandas as pd
+ import requests
+ import xlrd
+
+ from csv_detective.detection.engine import engine_to_file
+ from csv_detective.detection.rows import remove_empty_first_rows
+ from csv_detective.utils import (
+     display_logs_depending_process_time,
+     is_url,
+ )
+
+ NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
+ OLD_EXCEL_EXT = [".xls"]
+ OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
+ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
+
+
+ def parse_excel(
+     file_path: str,
+     num_rows: int = -1,
+     engine: Optional[str] = None,
+     sheet_name: Optional[str] = None,
+     random_state: int = 42,
+     verbose: bool = False,
+ ) -> tuple[pd.DataFrame, int, int, str, str, int]:
+     """"Excel-like parsing is really slow, could be a good improvement for future development"""
+     if verbose:
+         start = time()
+     no_sheet_specified = sheet_name is None
+
+     if (
+         engine in ['openpyxl', 'xlrd'] or
+         any([file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
+     ):
+         remote_content = None
+         if is_url(file_path):
+             r = requests.get(file_path)
+             r.raise_for_status()
+             remote_content = BytesIO(r.content)
+         if not engine:
+             if any([file_path.endswith(k) for k in NEW_EXCEL_EXT]):
+                 engine = "openpyxl"
+             else:
+                 engine = "xlrd"
+         if sheet_name is None:
+             if verbose:
+                 display_logs_depending_process_time(
+                     f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
+                     time() - start,
+                 )
+             try:
+                 if engine == "openpyxl":
+                     # openpyxl doesn't want to open files that don't have a valid extension
+                     # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
+                     # if the file is remote, we have a remote content anyway so it's fine
+                     if not remote_content and '.' not in file_path.split('/')[-1]:
+                         with open(file_path, 'rb') as f:
+                             remote_content = BytesIO(f.read())
+                     # faster than loading all sheets
+                     wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
+                     try:
+                         sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                     except TypeError:
+                         # sometimes read_only can't get the info, so we have to open the file for real
+                         # this takes more time but it's for a limited number of files
+                         # and it's this or nothing
+                         wb = openpyxl.load_workbook(remote_content or file_path)
+                         sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                 else:
+                     if remote_content:
+                         wb = xlrd.open_workbook(file_contents=remote_content.read())
+                     else:
+                         wb = xlrd.open_workbook(file_path)
+                     sizes = {s.name: s.nrows * s.ncols for s in wb.sheets()}
+                 sheet_name = max(sizes, key=sizes.get)
+             except xlrd.biffh.XLRDError:
+                 # sometimes a xls file is recognized as ods
+                 if verbose:
+                     display_logs_depending_process_time(
+                         'Could not read file with classic xls reader, trying with ODS',
+                         time() - start,
+                     )
+                 engine = "odf"
+
+     if engine == "odf" or any([file_path.endswith(k) for k in OPEN_OFFICE_EXT]):
+         # for ODS files, no way to get sheets' sizes without
+         # loading the file one way or another (pandas or pure odfpy)
+         # so all in one
+         engine = "odf"
+         if sheet_name is None:
+             if verbose:
+                 display_logs_depending_process_time(
+                     f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
+                     time() - start,
+                 )
+             tables = pd.read_excel(
+                 file_path,
+                 engine="odf",
+                 sheet_name=None,
+                 dtype="unicode",
+             )
+             sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
+             sheet_name = max(sizes, key=sizes.get)
+             if verbose:
+                 display_logs_depending_process_time(
+                     f'Going forwards with sheet "{sheet_name}"',
+                     time() - start,
+                 )
+             table = tables[sheet_name]
+         else:
+             if verbose:
+                 display_logs_depending_process_time(
+                     f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                     time() - start,
+                 )
+             table = pd.read_excel(
+                 file_path,
+                 engine="odf",
+                 sheet_name=sheet_name,
+                 dtype="unicode",
+             )
+         table, header_row_idx = remove_empty_first_rows(table)
+         total_lines = len(table)
+         nb_duplicates = len(table.loc[table.duplicated()])
+         if num_rows > 0:
+             num_rows = min(num_rows - 1, total_lines)
+             table = table.sample(num_rows, random_state=random_state)
+         if verbose:
+             display_logs_depending_process_time(
+                 f'Table parsed successfully in {round(time() - start, 3)}s',
+                 time() - start,
+             )
+         return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
+
+     # so here we end up with (old and new) excel files only
+     if verbose:
+         if no_sheet_specified:
+             display_logs_depending_process_time(
+                 f'Going forwards with sheet "{sheet_name}"',
+                 time() - start,
+             )
+         else:
+             display_logs_depending_process_time(
+                 f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                 time() - start,
+             )
+     table = pd.read_excel(
+         file_path,
+         engine=engine,
+         sheet_name=sheet_name,
+         dtype="unicode",
+     )
+     table, header_row_idx = remove_empty_first_rows(table)
+     total_lines = len(table)
+     nb_duplicates = len(table.loc[table.duplicated()])
+     if num_rows > 0:
+         num_rows = min(num_rows - 1, total_lines)
+         table = table.sample(num_rows, random_state=random_state)
+     if verbose:
+         display_logs_depending_process_time(
+             f'Table parsed successfully in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
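A sketch of how `parse_excel` might be called on its own; `data/example.xlsx` is a placeholder path, not a file shipped with the package:

```python
from csv_detective.parsing.excel import parse_excel

# placeholder path: point this at any local .xlsx/.xls/.ods file
table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
    file_path="data/example.xlsx",
    num_rows=-1,       # non-positive: keep every row instead of sampling
    sheet_name=None,   # let the function pick the largest sheet
    verbose=True,
)
print(engine, sheet_name, total_lines, nb_duplicates)
```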
csv_detective/parsing/load.py ADDED
@@ -0,0 +1,97 @@
+ from io import BytesIO, StringIO
+ from typing import Optional, Union
+
+ import pandas as pd
+ import requests
+
+ from csv_detective.detection.columns import detect_heading_columns, detect_trailing_columns
+ from csv_detective.detection.encoding import detect_encoding
+ from csv_detective.detection.engine import (
+     COMPRESSION_ENGINES,
+     EXCEL_ENGINES,
+     detect_engine,
+ )
+ from csv_detective.detection.headers import detect_headers
+ from csv_detective.detection.separator import detect_separator
+ from csv_detective.utils import is_url
+ from .compression import unzip
+ from .csv import parse_csv
+ from .excel import (
+     XLS_LIKE_EXT,
+     parse_excel,
+ )
+
+
+ def load_file(
+     file_path: str,
+     num_rows: int = 500,
+     encoding: Optional[str] = None,
+     sep: Optional[str] = None,
+     verbose: bool = False,
+     sheet_name: Optional[Union[str, int]] = None,
+ ) -> tuple[pd.DataFrame, dict]:
+     file_name = file_path.split('/')[-1]
+     engine = None
+     if '.' not in file_name or not file_name.endswith("csv"):
+         # file has no extension, we'll investigate how to read it
+         engine = detect_engine(file_path, verbose=verbose)
+
+     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
+         table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
+             file_path=file_path,
+             num_rows=num_rows,
+             engine=engine,
+             sheet_name=sheet_name,
+             verbose=verbose,
+         )
+         header = table.columns.to_list()
+         analysis = {
+             "engine": engine,
+             "sheet_name": sheet_name,
+         }
+     else:
+         # fetching or reading file as binary
+         if is_url(file_path):
+             r = requests.get(file_path, allow_redirects=True)
+             r.raise_for_status()
+             binary_file = BytesIO(r.content)
+         else:
+             binary_file = open(file_path, "rb")
+         # handling compression
+         if engine in COMPRESSION_ENGINES:
+             binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
+         # detecting encoding if not specified
+         if encoding is None:
+             encoding: str = detect_encoding(binary_file, verbose=verbose)
+             binary_file.seek(0)
+         # decoding and reading file
+         if is_url(file_path) or engine in COMPRESSION_ENGINES:
+             str_file = StringIO(binary_file.read().decode(encoding=encoding))
+         else:
+             str_file = open(file_path, "r", encoding=encoding)
+         if sep is None:
+             sep = detect_separator(str_file, verbose=verbose)
+         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+         if header is None:
+             return {"error": True}
+         elif isinstance(header, list):
+             if any([x is None for x in header]):
+                 return {"error": True}
+         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
+         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
+         table, total_lines, nb_duplicates = parse_csv(
+             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
+         )
+         analysis = {
+             "encoding": encoding,
+             "separator": sep,
+             "heading_columns": heading_columns,
+             "trailing_columns": trailing_columns,
+         }
+     analysis.update({
+         "header_row_idx": header_row_idx,
+         "header": header,
+         "total_lines": total_lines,
+         "nb_duplicates": nb_duplicates,
+     })
+     return table, analysis
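A sketch of the typical entry point into this module, assuming a local CSV at a placeholder path; the keys read from `analysis` are the ones populated in the CSV branch above:

```python
from csv_detective.parsing.load import load_file

# "data/example.csv" is a placeholder; a URL should work the same way
table, analysis = load_file("data/example.csv", num_rows=500, verbose=True)

print(analysis["encoding"], analysis["separator"])
print(analysis["header"], analysis["total_lines"], analysis["nb_duplicates"])
```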
csv_detective/parsing/text.py ADDED
@@ -0,0 +1,61 @@
+ from re import finditer
+
+
+ def camel_case_split(identifier: str):
+     matches = finditer(
+         ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier
+     )
+     return " ".join([m.group(0) for m in matches])
+
+
+ translate_dict = {
+     " ": ["-", "_", "'", ",", " "],
+     "a": ["à", "â"],
+     "c": ["ç"],
+     "e": ["é", "è", "ê", "é"],
+     "i": ["î", "ï"],
+     "o": ["ô", "ö"],
+     "u": ["ù", "û", "ü"],
+ }
+
+
+ # Process text
+ def _process_text(val: str):
+     """Traitement des chaînes de caractères pour les standardiser.
+     Plusieurs alternatives ont été testées : .translate, unidecode.unidecode,
+     des méthodes hybrides, mais aucune ne s'est avérée plus performante."""
+     val = camel_case_split(val)
+     val = val.lower()
+     for target in translate_dict:
+         for source in translate_dict[target]:
+             val = val.replace(source, target)
+     val = val.strip()
+     return val
+
+
+ def is_word_in_string(word: str, string: str):
+     # if the substring is too short, the test can become irrelevant
+     return len(word) > 2 and word in string
+
+
+ def header_score(header: str, words_combinations_list: list[str]) -> float:
+     """Returns:
+     - 1 if the header is exactly in the specified list
+     - 0.5 if any of the words is within the header
+     - 0 otherwise"""
+     processed_header = _process_text(header)
+
+     header_matches_words_combination = float(
+         any(
+             words_combination == processed_header for words_combination in words_combinations_list
+         )
+     )
+     words_combination_in_header = 0.5 * (
+         any(
+             is_word_in_string(
+                 words_combination, processed_header
+             ) for words_combination in words_combinations_list
+         )
+     )
+
+     return max(header_matches_words_combination, words_combination_in_header)
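A quick sketch of how these text helpers score headers; the labels below are illustrative:

```python
from csv_detective.parsing.text import _process_text, header_score

# camel case is split, accents and separators are normalised
print(_process_text("CodePostal"))  # -> "code postal"

# exact match after processing -> 1.0
print(header_score("Code_Postal", ["code postal"]))
# the candidate appears inside the header -> 0.5
print(header_score("code postal etablissement", ["code postal"]))
# no match -> 0.0
print(header_score("ville", ["code postal"]))
```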
csv_detective/validate.py CHANGED
@@ -4,7 +4,7 @@ from typing import Optional, Union
  import pandas as pd

  from csv_detective.load_tests import return_all_tests
- from .parsing.load import load_file
+ from csv_detective.parsing.load import load_file

  logging.basicConfig(level=logging.INFO)

CHANGELOG.md CHANGED
@@ -6,7 +6,7 @@
  - Refactor repo metadata and requirements [#120](https://github.com/datagouv/csv-detective/pull/120) [#122](https://github.com/datagouv/csv-detective/pull/122)
  - Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
  - For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124)
- - Fix imports [#125](https://github.com/datagouv/csv-detective/pull/125)
+ - Fix imports [#125](https://github.com/datagouv/csv-detective/pull/125) [#126](https://github.com/datagouv/csv-detective/pull/126) [#127](https://github.com/datagouv/csv-detective/pull/127)

  ## 0.8.0 (2025-05-20)

dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: csv_detective
- Version: 0.8.1.dev1469
+ Version: 0.8.1.dev1491
  Summary: Detect tabular files column content
  Home-page: https://github.com/datagouv/csv_detective
  Author: Etalab
dist-info/RECORD CHANGED
@@ -1,10 +1,10 @@
- csv_detective/__init__.py,sha256=fxctDlEyUexNk_ePriWu6V05xZEeirMV0v_StnEZ8vQ,165
- csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
- csv_detective/explore_csv.py,sha256=YxXgaUqUNdAGsU8bC-cs_TVvSza4wc4aMJQjWRkRT5s,9144
+ csv_detective/__init__.py,sha256=TwRP1gozmEmweSbK-lqihSsb-EqmCFSKUnJXz2x-dHE,191
+ csv_detective/cli.py,sha256=VNztFz2nc90E3zkghF8PYtXTEZ6TrBSCQMi9v1ljkJs,1414
+ csv_detective/explore_csv.py,sha256=VEeAJaz3FPOmGmQ-Yuf3FuSRRPULM03FrTf3qwZX52s,9222
  csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
  csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
- csv_detective/validate.py,sha256=4e7f8bNXPU9GqNx4QXXiaoINyotozbL52JB6psVAjyY,2631
+ csv_detective/validate.py,sha256=d_4Phmjk6Y0Z0YYVw4vpoZy8E79K370reGgkpzx1mcQ,2644
  csv_detective/detect_fields/__init__.py,sha256=7Tz0Niaz0BboA3YVsp_6WPA6ywciwDN4-lOy_Ie_0Y8,976
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -136,10 +136,17 @@ csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5
  csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
  csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
  csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
- csv_detective-0.8.1.dev1469.data/data/share/csv_detective/CHANGELOG.md,sha256=-Ut6d9FycTm_ax8QNjBEATCH9NOWOq3fwVLeSgjRTDU,8798
- csv_detective-0.8.1.dev1469.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
- csv_detective-0.8.1.dev1469.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
- csv_detective-0.8.1.dev1469.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+ csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ csv_detective/parsing/columns.py,sha256=e0xVmeXNvSC3su5HTFSNClgkz8PlFkoHmNwRYdS57mk,5670
+ csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
+ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
+ csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
+ csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
+ csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
+ csv_detective-0.8.1.dev1491.data/data/share/csv_detective/CHANGELOG.md,sha256=cfs5oHz9y-jeXsxyJ8tImHbpUVxtRdLmB03om8a0rco,8916
+ csv_detective-0.8.1.dev1491.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+ csv_detective-0.8.1.dev1491.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
+ csv_detective-0.8.1.dev1491.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
  tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
@@ -147,8 +154,8 @@ tests/test_file.py,sha256=FWVtYHlD5uU7tPeYsqlQg6O4lpU8Ct35vddkbzhvvjA,8508
  tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
  tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
  tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
- csv_detective-0.8.1.dev1469.dist-info/METADATA,sha256=J9fGXJjtRLS17DxyfwmzjteKpx23J01Cr3oNZaw0DSg,10443
- csv_detective-0.8.1.dev1469.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- csv_detective-0.8.1.dev1469.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.8.1.dev1469.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
- csv_detective-0.8.1.dev1469.dist-info/RECORD,,
+ csv_detective-0.8.1.dev1491.dist-info/METADATA,sha256=x0WDskrI4p-HHHSGpBBXmYgF010VKmFUG59dadKSXYI,10443
+ csv_detective-0.8.1.dev1491.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csv_detective-0.8.1.dev1491.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.8.1.dev1491.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+ csv_detective-0.8.1.dev1491.dist-info/RECORD,,