csv-detective 0.9.3.dev2348__py3-none-any.whl → 0.9.3.dev2382__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/METADATA +23 -23
- {csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/RECORD +6 -6
- {csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csv-detective
|
|
3
|
-
Version: 0.9.3.dev2348
|
|
3
|
+
Version: 0.9.3.dev2382
|
|
4
4
|
Summary: Detect tabular files column content
|
|
5
|
-
Author-email:
|
|
5
|
+
Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Source, https://github.com/datagouv/csv_detective
|
|
8
8
|
Keywords: CSV,data processing,encoding,guess,parser,tabular
|
|
9
|
-
Requires-Python: <3.
|
|
9
|
+
Requires-Python: <3.15,>=3.10
|
|
10
10
|
Description-Content-Type: text/markdown
|
|
11
11
|
License-File: LICENSE
|
|
12
12
|
Requires-Dist: dateparser<2,>=1.2.0
|
|
@@ -14,14 +14,14 @@ Requires-Dist: faust-cchardet==2.1.19
|
|
|
14
14
|
Requires-Dist: pandas<3,>=2.2.0
|
|
15
15
|
Requires-Dist: python-dateutil<3,>=2.8.2
|
|
16
16
|
Requires-Dist: Unidecode<2,>=1.3.6
|
|
17
|
-
Requires-Dist: openpyxl
|
|
18
|
-
Requires-Dist: xlrd
|
|
19
|
-
Requires-Dist: odfpy
|
|
17
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
18
|
+
Requires-Dist: xlrd>=2.0.1
|
|
19
|
+
Requires-Dist: odfpy>=1.4.1
|
|
20
20
|
Requires-Dist: requests<3,>=2.32.3
|
|
21
|
-
Requires-Dist: python-magic
|
|
21
|
+
Requires-Dist: python-magic>=0.4.27
|
|
22
22
|
Requires-Dist: frformat==0.4.0
|
|
23
23
|
Requires-Dist: Faker>=33.0.0
|
|
24
|
-
Requires-Dist: rstr
|
|
24
|
+
Requires-Dist: rstr>=3.2.2
|
|
25
25
|
Requires-Dist: more-itertools>=10.8.0
|
|
26
26
|
Provides-Extra: dev
|
|
27
27
|
Requires-Dist: pytest>=8.3.0; extra == "dev"
|
|
@@ -37,13 +37,13 @@ Currently supported file types: csv(.gz), xls, xlsx, ods.
|
|
|
37
37
|
|
|
38
38
|
You can also directly feed the URL of a remote file (from data.gouv.fr for instance).
|
|
39
39
|
|
|
40
|
-
## How To
|
|
40
|
+
## How To?
|
|
41
41
|
|
|
42
42
|
### Install the package
|
|
43
43
|
|
|
44
|
-
You need to have
|
|
44
|
+
You need to have Python >= 3.10 installed. We recommend using a virtual environment.
|
|
45
45
|
|
|
46
|
-
```
|
|
46
|
+
```bash
|
|
47
47
|
pip install csv-detective
|
|
48
48
|
```
|
|
49
49
|
|
|
@@ -64,8 +64,8 @@ inspection_results = routine(
|
|
|
64
64
|
file_path, # or file URL
|
|
65
65
|
num_rows=-1, # Value -1 will analyze all lines of your file, you can change with the number of lines you wish to analyze
|
|
66
66
|
save_results=False, # Default False. If True, it will save result output into the same directory as the analyzed file, using the same name as your file and .json extension
|
|
67
|
-
output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of
|
|
68
|
-
output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be
|
|
67
|
+
output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of your csv
|
|
68
|
+
output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate structure of other csv which should match same structure.
|
|
69
69
|
tags=["fr"], # Default None. If set as a list of strings, only performs checks related to the specified tags (you can see the available tags with FormatsManager().available_tags())
|
|
70
70
|
)
|
|
71
71
|
```
|
|
@@ -74,7 +74,7 @@ inspection_results = routine(
|
|
|
74
74
|
|
|
75
75
|
### Output
|
|
76
76
|
|
|
77
|
-
The program creates a `python` dictionnary with the following information :
|
|
77
|
+
The program creates a `python` dictionary with the following information :
|
|
78
78
|
|
|
79
79
|
```
|
|
80
80
|
{
|
|
@@ -111,7 +111,7 @@ The program creates a `python` dictionnary with the following information :
|
|
|
111
111
|
"profile": {
|
|
112
112
|
"column_name" : {
|
|
113
113
|
"min": 1, # only int and float
|
|
114
|
-
"max: 12, # only int and float
|
|
114
|
+
"max": 12, # only int and float
|
|
115
115
|
"mean": 5, # only int and float
|
|
116
116
|
"std": 5, # only int and float
|
|
117
117
|
"tops": [ # 10 most frequent values in the column
|
|
@@ -161,11 +161,11 @@ The program creates a `python` dictionnary with the following information :
|
|
|
161
161
|
|
|
162
162
|
The output slightly differs depending on the file format:
|
|
163
163
|
- csv files have `encoding` and `separator` (and `compression` if relevant)
|
|
164
|
-
- xls,
|
|
164
|
+
- xls, xlsx, ods files have `engine` and `sheet_name`
|
|
165
165
|
|
|
166
166
|
You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
|
|
167
167
|
- the analysis (as described above)
|
|
168
|
-
- an
|
|
168
|
+
- an iterator of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
|
|
169
169
|
```python
|
|
170
170
|
inspection, df_chunks = routine(
|
|
171
171
|
file_path=file_path,
|
|
@@ -188,7 +188,7 @@ Includes :
|
|
|
188
188
|
- UUIDs, Mongo ObjectIds
|
|
189
189
|
|
|
190
190
|
### Validation
|
|
191
|
-
If you have a pre-made analysis of a file, you can check whether
|
|
191
|
+
If you have a pre-made analysis of a file, you can check whether another file conforms to the same analysis:
|
|
192
192
|
```python
|
|
193
193
|
from csv_detective import validate
|
|
194
194
|
is_valid, *_ = validate(
|
|
@@ -226,7 +226,7 @@ Related ideas:
|
|
|
226
226
|
- store column names to make a learning model based on column names for (possible pre-screen)
|
|
227
227
|
- entity resolution (good luck...)
|
|
228
228
|
|
|
229
|
-
## Why Could This Be of Any Use
|
|
229
|
+
## Why Could This Be of Any Use?
|
|
230
230
|
|
|
231
231
|
Organisations such as [data.gouv.fr](http://data.gouv.fr) aggregate huge amounts of un-normalised data. Performing cross-examination across datasets can be difficult. This tool could help enrich the datasets metadata and facilitate linking them together.
|
|
232
232
|
|
|
@@ -247,6 +247,8 @@ ruff format .
|
|
|
247
247
|
|
|
248
248
|
The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
|
|
249
249
|
|
|
250
|
+
**Prerequisites**: [GitHub CLI](https://cli.github.com/) (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
|
|
251
|
+
|
|
250
252
|
```bash
|
|
251
253
|
# Create a new release
|
|
252
254
|
./tag_version.sh <version>
|
|
@@ -258,11 +260,9 @@ The release process uses the [`tag_version.sh`](tag_version.sh) script to create
|
|
|
258
260
|
./tag_version.sh 2.5.0 --dry-run
|
|
259
261
|
```
|
|
260
262
|
|
|
261
|
-
**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
|
|
262
|
-
|
|
263
263
|
The script automatically:
|
|
264
|
-
- Updates the version in pyproject.toml
|
|
265
|
-
- Extracts commits since the last tag and formats them for CHANGELOG.md
|
|
264
|
+
- Updates the version in `pyproject.toml`
|
|
265
|
+
- Extracts commits since the last tag and formats them for `CHANGELOG.md`
|
|
266
266
|
- Identifies breaking changes (commits with `!:` in the subject)
|
|
267
267
|
- Creates a git tag and pushes it to the remote repository
|
|
268
268
|
- Creates a GitHub release with the changelog content
|
|
@@ -85,7 +85,7 @@ csv_detective/parsing/csv.py,sha256=0T0gpaXzwJo-sq41IoLQD704GiMUYeDVVASVbat-zWg,
|
|
|
85
85
|
csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
|
|
86
86
|
csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
|
|
87
87
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
88
|
-
csv_detective-0.9.3.dev2348.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
88
|
+
csv_detective-0.9.3.dev2382.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
89
89
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
90
90
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
91
91
|
tests/test_fields.py,sha256=EWHIKwRSdIh74bBSoozYmZBETf7V03JMWpglyxA0ci0,5616
|
|
@@ -95,8 +95,8 @@ tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
|
|
|
95
95
|
tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
|
|
96
96
|
venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
|
|
97
97
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
98
|
-
csv_detective-0.9.3.
|
|
99
|
-
csv_detective-0.9.3.dev2348.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
100
|
-
csv_detective-0.9.3.dev2348.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
101
|
-
csv_detective-0.9.3.dev2348.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
102
|
-
csv_detective-0.9.3.dev2348.dist-info/RECORD,,
|
|
98
|
+
csv_detective-0.9.3.dev2382.dist-info/METADATA,sha256=gGotUngB4Ch3dhlapEv97KEq1JUX-xI1NsT51rOCZ1U,11084
|
|
99
|
+
csv_detective-0.9.3.dev2382.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
100
|
+
csv_detective-0.9.3.dev2382.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
101
|
+
csv_detective-0.9.3.dev2382.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
102
|
+
csv_detective-0.9.3.dev2382.dist-info/RECORD,,
|
|
File without changes
|
{csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2348.dist-info → csv_detective-0.9.3.dev2382.dist-info}/top_level.txt
RENAMED
|
File without changes
|