datagrunt 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datagrunt-0.0.0/LICENSE +21 -0
- datagrunt-0.0.0/PKG-INFO +155 -0
- datagrunt-0.0.0/README.md +119 -0
- datagrunt-0.0.0/pyproject.toml +71 -0
- datagrunt-0.0.0/setup.cfg +4 -0
- datagrunt-0.0.0/src/datagrunt/__init__.py +39 -0
- datagrunt-0.0.0/src/datagrunt/core/__init__.py +0 -0
- datagrunt-0.0.0/src/datagrunt/core/databases.py +53 -0
- datagrunt-0.0.0/src/datagrunt/core/engines.py +231 -0
- datagrunt-0.0.0/src/datagrunt/core/fileproperties.py +296 -0
- datagrunt-0.0.0/src/datagrunt/core/logger.py +47 -0
- datagrunt-0.0.0/src/datagrunt/core/queries.py +103 -0
- datagrunt-0.0.0/src/datagrunt/csvfile.py +156 -0
- datagrunt-0.0.0/src/datagrunt.egg-info/PKG-INFO +155 -0
- datagrunt-0.0.0/src/datagrunt.egg-info/SOURCES.txt +17 -0
- datagrunt-0.0.0/src/datagrunt.egg-info/dependency_links.txt +1 -0
- datagrunt-0.0.0/src/datagrunt.egg-info/requires.txt +16 -0
- datagrunt-0.0.0/src/datagrunt.egg-info/top_level.txt +1 -0
- datagrunt-0.0.0/tests/test_csvfile.py +630 -0
datagrunt-0.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Martin Graham
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
datagrunt-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datagrunt
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Read CSV files and convert to other file formats easily
|
|
5
|
+
Author-email: Martin Graham <datagrunt@datagrunt.io>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://pmgraham.github.io/datagrunt-docs
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/pmgraham/datagrunt/issues
|
|
9
|
+
Project-URL: Documentation, https://pmgraham.github.io/datagrunt-docs
|
|
10
|
+
Project-URL: Source Code, https://github.com/pmgraham/datagrunt
|
|
11
|
+
Keywords: csv,data,duckdb,polars,pyarrow,xlsx,delimiter
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: duckdb>=1.1.0
|
|
23
|
+
Requires-Dist: polars>=1.7.1
|
|
24
|
+
Requires-Dist: pyarrow>=17.0.0
|
|
25
|
+
Requires-Dist: XlsxWriter>=3.2.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=3.0; extra == "dev"
|
|
29
|
+
Requires-Dist: black; extra == "dev"
|
|
30
|
+
Requires-Dist: isort; extra == "dev"
|
|
31
|
+
Requires-Dist: flake8; extra == "dev"
|
|
32
|
+
Provides-Extra: build
|
|
33
|
+
Requires-Dist: build; extra == "build"
|
|
34
|
+
Requires-Dist: twine; extra == "build"
|
|
35
|
+
Requires-Dist: bumpver; extra == "build"
|
|
36
|
+
|
|
37
|
+
# Welcome To Datagrunt
|
|
38
|
+
|
|
39
|
+
Datagrunt is a Python library designed to simplify the way you work with CSV files. It provides a streamlined approach to reading, processing, and transforming your data into various formats, making data manipulation efficient and intuitive.
|
|
40
|
+
|
|
41
|
+
## Why Datagrunt?
|
|
42
|
+
|
|
43
|
+
Born out of real-world frustration, Datagrunt eliminates the need for repetitive coding when handling CSV files. Whether you're a data analyst, data engineer, or data scientist, Datagrunt empowers you to focus on insights, not tedious data wrangling.
|
|
44
|
+
|
|
45
|
+
## Key Features
|
|
46
|
+
|
|
47
|
+
- **Intelligent Delimiter Inference:** Datagrunt automatically detects and applies the correct delimiter for your CSV files.
|
|
48
|
+
- **Seamless Data Processing:** Leverage the robust capabilities of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) to perform advanced data processing tasks directly on your CSV data.
|
|
49
|
+
- **Flexible Transformation:** Easily convert your processed CSV data into various formats to suit your needs.
|
|
50
|
+
- **Pythonic API:** Enjoy a clean and intuitive API that integrates seamlessly into your existing Python workflows.
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
Get started with Datagrunt in seconds using pip:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install datagrunt
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Getting Started
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from datagrunt import CSVReader
|
|
64
|
+
|
|
65
|
+
# Load your CSV file
|
|
66
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
67
|
+
engine = 'duckdb'
|
|
68
|
+
|
|
69
|
+
# Set duckdb as the processing engine. Engine set to 'polars' by default
|
|
70
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
71
|
+
|
|
72
|
+
# return sample of the data to get a peek at the schema
|
|
73
|
+
dg.get_sample()
|
|
74
|
+
┌────────────┬───────────┬──────────────┬───┬──────────────────────┬──────────────────────┬───────────────────┐
|
|
75
|
+
│ VIN (1-10) │ County │ City │ … │ Vehicle Location │ Electric Utility │ 2020 Census Tract │
|
|
76
|
+
│ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │
|
|
77
|
+
├────────────┼───────────┼──────────────┼───┼──────────────────────┼──────────────────────┼───────────────────┤
|
|
78
|
+
│ 5YJSA1E28K │ Snohomish │ Mukilteo │ … │ POINT (-122.29943 … │ PUGET SOUND ENERGY… │ 53061042001 │
|
|
79
|
+
│ 1C4JJXP68P │ Yakima │ Yakima │ … │ POINT (-120.468875… │ PACIFICORP │ 53077001601 │
|
|
80
|
+
│ WBY8P6C05L │ Kitsap │ Kingston │ … │ POINT (-122.517835… │ PUGET SOUND ENERGY… │ 53035090102 │
|
|
81
|
+
│ JTDKARFP1J │ Kitsap │ Port Orchard │ … │ POINT (-122.653005… │ PUGET SOUND ENERGY… │ 53035092802 │
|
|
82
|
+
│ 5UXTA6C09N │ Snohomish │ Everett │ … │ POINT (-122.203234… │ PUGET SOUND ENERGY… │ 53061041605 │
|
|
83
|
+
│ 5YJYGDEF8L │ King │ Seattle │ … │ POINT (-122.378886… │ CITY OF SEATTLE - … │ 53033004703 │
|
|
84
|
+
│ JTMAB3FV7P │ Thurston │ Rainier │ … │ POINT (-122.677141… │ PUGET SOUND ENERGY… │ 53067012530 │
|
|
85
|
+
│ JN1AZ0CPXC │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022402 │
|
|
86
|
+
│ JN1AZ0CP7B │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022603 │
|
|
87
|
+
│ 1N4AZ0CP0F │ Thurston │ Olympia │ … │ POINT (-122.86491 … │ PUGET SOUND ENERGY… │ 53067010300 │
|
|
88
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
89
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
90
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
91
|
+
│ 5YJYGDEE7M │ Clark │ Vancouver │ … │ POINT (-122.515805… │ BONNEVILLE POWER A… │ 53011041310 │
|
|
92
|
+
│ 7SAYGAEE0P │ Snohomish │ Monroe │ … │ POINT (-121.968385… │ PUGET SOUND ENERGY… │ 53061052203 │
|
|
93
|
+
│ 2C4RC1N75P │ King │ Burien │ … │ POINT (-122.347227… │ CITY OF SEATTLE - … │ 53033027600 │
|
|
94
|
+
│ 1FTVW1EVXP │ King │ Kirkland │ … │ POINT (-122.202653… │ PUGET SOUND ENERGY… │ 53033022300 │
|
|
95
|
+
│ 4JGGM1CB2P │ King │ Seattle │ … │ POINT (-122.2453 4… │ CITY OF SEATTLE - … │ 53033011700 │
|
|
96
|
+
│ 1N4BZ0CP0G │ King │ Seattle │ … │ POINT (-122.334079… │ CITY OF SEATTLE - … │ 53033008300 │
|
|
97
|
+
│ 7SAYGDEF2N │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024704 │
|
|
98
|
+
│ 1N4BZ1DP7L │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024902 │
|
|
99
|
+
...
|
|
100
|
+
├────────────┴───────────┴──────────────┴───┴──────────────────────┴──────────────────────┴───────────────────┤
|
|
101
|
+
│ ? rows (>9999 rows, 20 shown) 17 columns (6 shown) │
|
|
102
|
+
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## DuckDB Integration for Performant SQL Queries
|
|
106
|
+
```python
|
|
107
|
+
from datagrunt import CSVReader
|
|
108
|
+
|
|
109
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
110
|
+
engine = 'duckdb'
|
|
111
|
+
|
|
112
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
113
|
+
|
|
114
|
+
# Construct your SQL query
|
|
115
|
+
query = f"""
|
|
116
|
+
WITH core AS (
|
|
117
|
+
SELECT
|
|
118
|
+
City AS city,
|
|
119
|
+
"VIN (1-10)" AS vin
|
|
120
|
+
FROM {dg.db_table}
|
|
121
|
+
)
|
|
122
|
+
SELECT
|
|
123
|
+
city,
|
|
124
|
+
COUNT(vin) AS vehicle_count
|
|
125
|
+
FROM core
|
|
126
|
+
GROUP BY 1
|
|
127
|
+
ORDER BY 2 DESC
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
# Execute the query and get results as a Polars DataFrame
|
|
131
|
+
df = dg.query_data(query).pl()
|
|
132
|
+
print(df)
|
|
133
|
+
┌────────────────┬───────────────┐
|
|
134
|
+
│ city ┆ vehicle_count │
|
|
135
|
+
│ --- ┆ --- │
|
|
136
|
+
│ str ┆ i64 │
|
|
137
|
+
╞════════════════╪═══════════════╡
|
|
138
|
+
│ Seattle ┆ 32602 │
|
|
139
|
+
│ Bellevue ┆ 9960 │
|
|
140
|
+
│ Redmond ┆ 7165 │
|
|
141
|
+
│ Vancouver ┆ 7081 │
|
|
142
|
+
│ Bothell ┆ 6602 │
|
|
143
|
+
│ … ┆ … │
|
|
144
|
+
│ Glenwood ┆ 1 │
|
|
145
|
+
│ Walla Walla Co ┆ 1 │
|
|
146
|
+
│ Pittsburg ┆ 1 │
|
|
147
|
+
│ Decatur ┆ 1 │
|
|
148
|
+
│ Redwood City ┆ 1 │
|
|
149
|
+
└────────────────┴───────────────┘
|
|
150
|
+
```
|
|
151
|
+
## License
|
|
152
|
+
This project is licensed under the [MIT License](https://opensource.org/license/mit).
|
|
153
|
+
|
|
154
|
+
## Acknowledgements
|
|
155
|
+
A HUGE thank you to the open source community and the creators of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) for their fantastic libraries that power Datagrunt.
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Welcome To Datagrunt
|
|
2
|
+
|
|
3
|
+
Datagrunt is a Python library designed to simplify the way you work with CSV files. It provides a streamlined approach to reading, processing, and transforming your data into various formats, making data manipulation efficient and intuitive.
|
|
4
|
+
|
|
5
|
+
## Why Datagrunt?
|
|
6
|
+
|
|
7
|
+
Born out of real-world frustration, Datagrunt eliminates the need for repetitive coding when handling CSV files. Whether you're a data analyst, data engineer, or data scientist, Datagrunt empowers you to focus on insights, not tedious data wrangling.
|
|
8
|
+
|
|
9
|
+
## Key Features
|
|
10
|
+
|
|
11
|
+
- **Intelligent Delimiter Inference:** Datagrunt automatically detects and applies the correct delimiter for your CSV files.
|
|
12
|
+
- **Seamless Data Processing:** Leverage the robust capabilities of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) to perform advanced data processing tasks directly on your CSV data.
|
|
13
|
+
- **Flexible Transformation:** Easily convert your processed CSV data into various formats to suit your needs.
|
|
14
|
+
- **Pythonic API:** Enjoy a clean and intuitive API that integrates seamlessly into your existing Python workflows.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Get started with Datagrunt in seconds using pip:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install datagrunt
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Getting Started
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from datagrunt import CSVReader
|
|
28
|
+
|
|
29
|
+
# Load your CSV file
|
|
30
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
31
|
+
engine = 'duckdb'
|
|
32
|
+
|
|
33
|
+
# Set duckdb as the processing engine. Engine set to 'polars' by default
|
|
34
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
35
|
+
|
|
36
|
+
# return sample of the data to get a peek at the schema
|
|
37
|
+
dg.get_sample()
|
|
38
|
+
┌────────────┬───────────┬──────────────┬───┬──────────────────────┬──────────────────────┬───────────────────┐
|
|
39
|
+
│ VIN (1-10) │ County │ City │ … │ Vehicle Location │ Electric Utility │ 2020 Census Tract │
|
|
40
|
+
│ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │
|
|
41
|
+
├────────────┼───────────┼──────────────┼───┼──────────────────────┼──────────────────────┼───────────────────┤
|
|
42
|
+
│ 5YJSA1E28K │ Snohomish │ Mukilteo │ … │ POINT (-122.29943 … │ PUGET SOUND ENERGY… │ 53061042001 │
|
|
43
|
+
│ 1C4JJXP68P │ Yakima │ Yakima │ … │ POINT (-120.468875… │ PACIFICORP │ 53077001601 │
|
|
44
|
+
│ WBY8P6C05L │ Kitsap │ Kingston │ … │ POINT (-122.517835… │ PUGET SOUND ENERGY… │ 53035090102 │
|
|
45
|
+
│ JTDKARFP1J │ Kitsap │ Port Orchard │ … │ POINT (-122.653005… │ PUGET SOUND ENERGY… │ 53035092802 │
|
|
46
|
+
│ 5UXTA6C09N │ Snohomish │ Everett │ … │ POINT (-122.203234… │ PUGET SOUND ENERGY… │ 53061041605 │
|
|
47
|
+
│ 5YJYGDEF8L │ King │ Seattle │ … │ POINT (-122.378886… │ CITY OF SEATTLE - … │ 53033004703 │
|
|
48
|
+
│ JTMAB3FV7P │ Thurston │ Rainier │ … │ POINT (-122.677141… │ PUGET SOUND ENERGY… │ 53067012530 │
|
|
49
|
+
│ JN1AZ0CPXC │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022402 │
|
|
50
|
+
│ JN1AZ0CP7B │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022603 │
|
|
51
|
+
│ 1N4AZ0CP0F │ Thurston │ Olympia │ … │ POINT (-122.86491 … │ PUGET SOUND ENERGY… │ 53067010300 │
|
|
52
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
53
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
54
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
55
|
+
│ 5YJYGDEE7M │ Clark │ Vancouver │ … │ POINT (-122.515805… │ BONNEVILLE POWER A… │ 53011041310 │
|
|
56
|
+
│ 7SAYGAEE0P │ Snohomish │ Monroe │ … │ POINT (-121.968385… │ PUGET SOUND ENERGY… │ 53061052203 │
|
|
57
|
+
│ 2C4RC1N75P │ King │ Burien │ … │ POINT (-122.347227… │ CITY OF SEATTLE - … │ 53033027600 │
|
|
58
|
+
│ 1FTVW1EVXP │ King │ Kirkland │ … │ POINT (-122.202653… │ PUGET SOUND ENERGY… │ 53033022300 │
|
|
59
|
+
│ 4JGGM1CB2P │ King │ Seattle │ … │ POINT (-122.2453 4… │ CITY OF SEATTLE - … │ 53033011700 │
|
|
60
|
+
│ 1N4BZ0CP0G │ King │ Seattle │ … │ POINT (-122.334079… │ CITY OF SEATTLE - … │ 53033008300 │
|
|
61
|
+
│ 7SAYGDEF2N │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024704 │
|
|
62
|
+
│ 1N4BZ1DP7L │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024902 │
|
|
63
|
+
...
|
|
64
|
+
├────────────┴───────────┴──────────────┴───┴──────────────────────┴──────────────────────┴───────────────────┤
|
|
65
|
+
│ ? rows (>9999 rows, 20 shown) 17 columns (6 shown) │
|
|
66
|
+
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## DuckDB Integration for Performant SQL Queries
|
|
70
|
+
```python
|
|
71
|
+
from datagrunt import CSVReader
|
|
72
|
+
|
|
73
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
74
|
+
engine = 'duckdb'
|
|
75
|
+
|
|
76
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
77
|
+
|
|
78
|
+
# Construct your SQL query
|
|
79
|
+
query = f"""
|
|
80
|
+
WITH core AS (
|
|
81
|
+
SELECT
|
|
82
|
+
City AS city,
|
|
83
|
+
"VIN (1-10)" AS vin
|
|
84
|
+
FROM {dg.db_table}
|
|
85
|
+
)
|
|
86
|
+
SELECT
|
|
87
|
+
city,
|
|
88
|
+
COUNT(vin) AS vehicle_count
|
|
89
|
+
FROM core
|
|
90
|
+
GROUP BY 1
|
|
91
|
+
ORDER BY 2 DESC
|
|
92
|
+
"""
|
|
93
|
+
|
|
94
|
+
# Execute the query and get results as a Polars DataFrame
|
|
95
|
+
df = dg.query_data(query).pl()
|
|
96
|
+
print(df)
|
|
97
|
+
┌────────────────┬───────────────┐
|
|
98
|
+
│ city ┆ vehicle_count │
|
|
99
|
+
│ --- ┆ --- │
|
|
100
|
+
│ str ┆ i64 │
|
|
101
|
+
╞════════════════╪═══════════════╡
|
|
102
|
+
│ Seattle ┆ 32602 │
|
|
103
|
+
│ Bellevue ┆ 9960 │
|
|
104
|
+
│ Redmond ┆ 7165 │
|
|
105
|
+
│ Vancouver ┆ 7081 │
|
|
106
|
+
│ Bothell ┆ 6602 │
|
|
107
|
+
│ … ┆ … │
|
|
108
|
+
│ Glenwood ┆ 1 │
|
|
109
|
+
│ Walla Walla Co ┆ 1 │
|
|
110
|
+
│ Pittsburg ┆ 1 │
|
|
111
|
+
│ Decatur ┆ 1 │
|
|
112
|
+
│ Redwood City ┆ 1 │
|
|
113
|
+
└────────────────┴───────────────┘
|
|
114
|
+
```
|
|
115
|
+
## License
|
|
116
|
+
This project is licensed under the [MIT License](https://opensource.org/license/mit).
|
|
117
|
+
|
|
118
|
+
## Acknowledgements
|
|
119
|
+
A HUGE thank you to the open source community and the creators of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) for their fantastic libraries that power Datagrunt.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=74.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "datagrunt"
|
|
7
|
+
version = "0.0.0"
|
|
8
|
+
description = "Read CSV files and convert to other file formats easily"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "Martin Graham", email = "datagrunt@datagrunt.io" }]
|
|
11
|
+
license = {text = "MIT License"}
|
|
12
|
+
classifiers = [
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.10",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
20
|
+
]
|
|
21
|
+
keywords = ["csv", "data", "duckdb", "polars", "pyarrow", "xlsx", "delimiter"]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"duckdb>=1.1.0",
|
|
24
|
+
"polars>=1.7.1",
|
|
25
|
+
"pyarrow>=17.0.0",
|
|
26
|
+
"XlsxWriter>=3.2.0"
|
|
27
|
+
]
|
|
28
|
+
requires-python = ">=3.10"
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest>=7.0", "pytest-cov>=3.0", "black", "isort", "flake8"]
|
|
32
|
+
build = ["build", "twine", "bumpver"]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://pmgraham.github.io/datagrunt-docs"
|
|
36
|
+
"Bug Tracker" = "https://github.com/pmgraham/datagrunt/issues"
|
|
37
|
+
Documentation = "https://pmgraham.github.io/datagrunt-docs"
|
|
38
|
+
"Source Code" = "https://github.com/pmgraham/datagrunt"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["src"]
|
|
42
|
+
include = ["datagrunt*"]
|
|
43
|
+
exclude = ["tests*"]
|
|
44
|
+
|
|
45
|
+
[tool.bumpver]
|
|
46
|
+
current_version = "0.0.0"
|
|
47
|
+
version_pattern = "MAJOR.MINOR.PATCH"
|
|
48
|
+
commit_message = "bump version {old_version} -> {new_version}"
|
|
49
|
+
commit = true
|
|
50
|
+
tag = true
|
|
51
|
+
push = false
|
|
52
|
+
|
|
53
|
+
[tool.bumpver.file_patterns]
|
|
54
|
+
"pyproject.toml" = [
|
|
55
|
+
'current_version = "{version}"',
|
|
56
|
+
'version = "{version}"',
|
|
57
|
+
]
|
|
58
|
+
"src/datagrunt/__init__.py" = [
|
|
59
|
+
'^__version__ = "{version}"$',
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[tool.black]
|
|
63
|
+
line-length = 88
|
|
64
|
+
target-version = ['py310']
|
|
65
|
+
|
|
66
|
+
[tool.isort]
|
|
67
|
+
profile = "black"
|
|
68
|
+
|
|
69
|
+
[tool.flake8]
|
|
70
|
+
max-line-length = 88
|
|
71
|
+
extend-ignore = "E203"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datagrunt
|
|
3
|
+
|
|
4
|
+
A Python library designed to simplify the way you work with CSV files.
|
|
5
|
+
|
|
6
|
+
This module provides inferred CSV delimiters and helper methods for reading and writing CSV files.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
A simple example of how to use the main functionality of the package:
|
|
10
|
+
|
|
11
|
+
from datagrunt.csvfile import CSVReader
|
|
12
|
+
|
|
13
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
14
|
+
engine = 'duckdb'
|
|
15
|
+
|
|
16
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
17
|
+
|
|
18
|
+
dg.get_sample()
|
|
19
|
+
|
|
20
|
+
Attributes:
|
|
21
|
+
__version__: A string representing the version of this module.
|
|
22
|
+
__author__: The name of the package author.
|
|
23
|
+
__license__: The license under which the package is released.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
__version__ = "0.0.0"
|
|
27
|
+
__author__ = "Martin Graham"
|
|
28
|
+
__license__ = "MIT"
|
|
29
|
+
|
|
30
|
+
# Import key classes, functions, or submodules that should be available at the package level
|
|
31
|
+
from .csvfile import CSVReader, CSVWriter
|
|
32
|
+
|
|
33
|
+
# You can define __all__ to specify what gets imported with "from package import *"
|
|
34
|
+
__all__ = ['CSVReader', 'CSVWriter']
|
|
35
|
+
|
|
36
|
+
# Optionally, you can include a logger for your package
|
|
37
|
+
import logging
|
|
38
|
+
|
|
39
|
+
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Module for interfacing with databases."""
|
|
2
|
+
|
|
3
|
+
# standard library
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# third party libraries
|
|
9
|
+
import duckdb
|
|
10
|
+
|
|
11
|
+
class DuckDBDatabase:
|
|
12
|
+
"""Class to configure local database for file processing.
|
|
13
|
+
Utilizes duckdb as the processing engine.
|
|
14
|
+
"""
|
|
15
|
+
DEFAULT_ENCODING = 'utf-8'
|
|
16
|
+
DEFAULT_THREAD_COUNT = 16
|
|
17
|
+
|
|
18
|
+
def __init__(self, filepath):
|
|
19
|
+
"""
|
|
20
|
+
Initialize the DuckDBDatabase class.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
filepath (str): Path to the file.
|
|
24
|
+
"""
|
|
25
|
+
self.filepath = filepath
|
|
26
|
+
self.database_filename = self._set_database_filename()
|
|
27
|
+
self.database_table_name = self._set_database_table_name()
|
|
28
|
+
self.database_connection = self._set_database_connection()
|
|
29
|
+
|
|
30
|
+
def __del__(self):
|
|
31
|
+
"""Delete .db files after use."""
|
|
32
|
+
if os.path.exists(self.database_filename):
|
|
33
|
+
os.remove(self.database_filename)
|
|
34
|
+
|
|
35
|
+
def _format_filename_string(self):
|
|
36
|
+
"""Remove all non alphanumeric characters from filename."""
|
|
37
|
+
return re.sub(r'[^a-zA-Z0-9]', '', Path(self.filepath).stem)
|
|
38
|
+
|
|
39
|
+
def _set_database_filename(self):
|
|
40
|
+
"""Return name of duckdb file created at runtime."""
|
|
41
|
+
return f'{self._format_filename_string()}.db'
|
|
42
|
+
|
|
43
|
+
def _set_database_table_name(self):
|
|
44
|
+
"""Return name of duckdb import table created during file import."""
|
|
45
|
+
return f'{self._format_filename_string()}'
|
|
46
|
+
|
|
47
|
+
def _set_database_connection(self, threads=DEFAULT_THREAD_COUNT):
|
|
48
|
+
"""Establish a connection with duckdb.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
threads (int): Number of threads to use for duckdb.
|
|
52
|
+
"""
|
|
53
|
+
return duckdb.connect(self.database_filename, config = {'threads': threads})
|