datagrunt 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datagrunt/__init__.py +39 -0
- datagrunt/core/__init__.py +0 -0
- datagrunt/core/databases.py +53 -0
- datagrunt/core/engines.py +231 -0
- datagrunt/core/fileproperties.py +296 -0
- datagrunt/core/logger.py +47 -0
- datagrunt/core/queries.py +103 -0
- datagrunt/csvfile.py +156 -0
- datagrunt-0.0.0.dist-info/LICENSE +21 -0
- datagrunt-0.0.0.dist-info/METADATA +155 -0
- datagrunt-0.0.0.dist-info/RECORD +13 -0
- datagrunt-0.0.0.dist-info/WHEEL +5 -0
- datagrunt-0.0.0.dist-info/top_level.txt +1 -0
datagrunt/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datagrunt
|
|
3
|
+
|
|
4
|
+
A Python library designed to simplify the way you work with CSV files.
|
|
5
|
+
|
|
6
|
+
This module provides inferred CSV delimiters and helper methods for reading and writing CSV files.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
A simple example of how to use the main functionality of your package:
|
|
10
|
+
|
|
11
|
+
from datagrunt.csvfile import CSVReader
|
|
12
|
+
|
|
13
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
14
|
+
engine = 'duckdb'
|
|
15
|
+
|
|
16
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
17
|
+
|
|
18
|
+
dg.get_sample()
|
|
19
|
+
|
|
20
|
+
Attributes:
|
|
21
|
+
__version__: A string representing the version of this module.
|
|
22
|
+
__author__: The name of the package author.
|
|
23
|
+
__license__: The license under which the package is released.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
__version__ = "0.0.0"
|
|
27
|
+
__author__ = "Martin Graham"
|
|
28
|
+
__license__ = "MIT"
|
|
29
|
+
|
|
30
|
+
# Import key classes, functions, or submodules that should be available at the package level
|
|
31
|
+
from .csvfile import CSVReader, CSVWriter
|
|
32
|
+
|
|
33
|
+
# You can define __all__ to specify what gets imported with "from package import *"
|
|
34
|
+
__all__ = ['CSVReader', 'CSVWriter']
|
|
35
|
+
|
|
36
|
+
# Optionally, you can include a logger for your package
|
|
37
|
+
import logging
|
|
38
|
+
|
|
39
|
+
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Module for interfacing with databases."""
|
|
2
|
+
|
|
3
|
+
# standard library
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# third party libraries
|
|
9
|
+
import duckdb
|
|
10
|
+
|
|
11
|
+
class DuckDBDatabase:
    """Configure a local, file-backed DuckDB database for file processing.

    The database file and the import table are both named after the
    alphanumeric characters of the source file's stem (``my-data.csv`` gives
    the database file ``mydata.db`` and the table ``mydata``). The database
    filename is a relative path, so it is resolved against the current
    working directory. The file is removed when the instance is finalized.
    """

    DEFAULT_ENCODING = 'utf-8'
    DEFAULT_THREAD_COUNT = 16

    def __init__(self, filepath):
        """
        Initialize the DuckDBDatabase class.

        Args:
            filepath (str): Path to the file.
        """
        self.filepath = filepath
        self.database_filename = self._set_database_filename()
        self.database_table_name = self._set_database_table_name()
        self.database_connection = self._set_database_connection()

    def __del__(self):
        """Close the connection and delete the .db file after use."""
        # __init__ may have failed part-way (e.g. duckdb.connect raised), so
        # neither attribute is guaranteed to exist when the finalizer runs.
        connection = getattr(self, 'database_connection', None)
        database_filename = getattr(self, 'database_filename', None)
        try:
            # Release the file handle first; an open connection can prevent
            # the database file from being deleted.
            if connection is not None:
                connection.close()
        finally:
            if database_filename:
                try:
                    os.remove(database_filename)
                except FileNotFoundError:
                    # Already removed (for instance by another instance that
                    # was created for the same source file).
                    pass

    def _format_filename_string(self):
        """Return the file stem with all non alphanumeric characters removed."""
        return re.sub(r'[^a-zA-Z0-9]', '', Path(self.filepath).stem)

    def _set_database_filename(self):
        """Return name of duckdb file created at runtime."""
        return f'{self._format_filename_string()}.db'

    def _set_database_table_name(self):
        """Return name of duckdb import table created during file import."""
        return f'{self._format_filename_string()}'

    def _set_database_connection(self, threads=DEFAULT_THREAD_COUNT):
        """Establish a connection with duckdb.

        Args:
            threads (int): Number of threads to use for duckdb.

        Returns:
            The connection object returned by ``duckdb.connect``.
        """
        return duckdb.connect(self.database_filename, config={'threads': threads})
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Module engines to enable data processing."""
|
|
2
|
+
|
|
3
|
+
# standard library
|
|
4
|
+
|
|
5
|
+
# third party libraries
|
|
6
|
+
import duckdb
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
# local libraries
|
|
10
|
+
from .fileproperties import CSVProperties
|
|
11
|
+
from .queries import DuckDBQueries
|
|
12
|
+
from .logger import show_large_file_warning, show_dataframe_sample
|
|
13
|
+
|
|
14
|
+
class CSVReaderDuckDBEngine(CSVProperties):
    """Read a CSV file with DuckDB and convert it to in-memory formats."""

    def __init__(self, filepath):
        """
        Initialize the DuckDB reader engine.

        Args:
            filepath (str): Path to the file to read.
        """
        super().__init__(filepath)
        queries = DuckDBQueries(self.filepath)
        self.db_table = queries.database_table_name

    def _read_csv(self):
        """Read the CSV file with DuckDB using the inferred delimiter.

        Returns:
            A DuckDB DuckDBPyRelation.
        """
        read_options = {
            'delimiter': self.delimiter,
            'null_padding': True,
            'all_varchar': True,
        }
        return duckdb.read_csv(self.filepath, **read_options)

    def get_sample(self):
        """Display a sample of the CSV file via the relation's ``show()``."""
        relation = self._read_csv()
        relation.show()

    def to_dataframe(self):
        """Convert the CSV file to a Polars dataframe.

        A warning is shown first when the file is flagged as large.

        Returns:
            A Polars dataframe.
        """
        if self.is_large:
            show_large_file_warning()
        relation = self._read_csv()
        return relation.pl()

    def to_arrow_table(self):
        """Convert the CSV file to a PyArrow table.

        Returns:
            A PyArrow table.
        """
        return self._read_csv().arrow()

    def to_dicts(self):
        """Convert the CSV file to a list of Python dictionaries.

        Returns:
            A list of dictionaries.
        """
        return self.to_dataframe().to_dicts()
|
|
70
|
+
|
|
71
|
+
class CSVReaderPolarsEngine(CSVProperties):
    """Read a CSV file with Polars and convert it to in-memory formats."""

    def get_sample(self):
        """Show the first ``DATAFRAME_SAMPLE_ROWS`` rows of the CSV file."""
        sample = pl.read_csv(
            self.filepath,
            separator=self.delimiter,
            truncate_ragged_lines=True,
            n_rows=self.DATAFRAME_SAMPLE_ROWS,
        )
        show_dataframe_sample(sample)

    def to_dataframe(self):
        """Convert the CSV file to a Polars dataframe.

        A warning is shown first when the file is flagged as large.

        Returns:
            A Polars dataframe.
        """
        if self.is_large:
            show_large_file_warning()
        dataframe = pl.read_csv(
            self.filepath,
            separator=self.delimiter,
            truncate_ragged_lines=True,
        )
        return dataframe

    def to_arrow_table(self):
        """Convert the CSV file to a PyArrow table.

        Returns:
            A PyArrow table.
        """
        return self.to_dataframe().to_arrow()

    def to_dicts(self):
        """Convert the CSV file to a list of Python dictionaries.

        Returns:
            A list of dictionaries.
        """
        return self.to_dataframe().to_dicts()
|
|
113
|
+
|
|
114
|
+
class CSVWriterDuckDBEngine(CSVProperties):
    """Convert a CSV file to other supported file types using DuckDB.

    Every ``write_*`` method first imports the CSV file into a DuckDB table
    and then runs the matching export query.
    """

    def __init__(self, filepath):
        """
        Initialize the DuckDB writer engine.

        Args:
            filepath (str): Path to the CSV file to convert.
        """
        super().__init__(filepath)
        self.queries = DuckDBQueries(self.filepath)

    def _import_csv(self, run):
        """Run the CSV import query through the given DuckDB callable."""
        run(self.queries.import_csv_query(self.delimiter))

    def write_csv(self, out_filename=None):
        """Export the imported CSV data to a CSV file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        self._import_csv(duckdb.sql)
        export_query = self.queries.export_csv_query(out_filename)
        duckdb.sql(export_query)

    def write_excel(self, out_filename=None):
        """Export the imported CSV data to an Excel file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        self._import_csv(duckdb.sql)
        export_query = self.queries.export_excel_query(out_filename)
        duckdb.sql(export_query)

    def write_json(self, out_filename=None):
        """Export the imported CSV data to a JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        self._import_csv(duckdb.sql)
        export_query = self.queries.export_json_query(out_filename)
        duckdb.sql(export_query)

    def write_json_newline_delimited(self, out_filename=None):
        """Export the imported CSV data to a newline delimited JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        self._import_csv(duckdb.sql)
        export_query = self.queries.export_json_newline_delimited_query(out_filename)
        duckdb.sql(export_query)

    def write_parquet(self, out_filename=None):
        """Export the imported CSV data to a Parquet file.

        Unlike the other writers, this one runs its queries through
        ``duckdb.execute``.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        self._import_csv(duckdb.execute)
        export_query = self.queries.export_parquet_query(out_filename)
        duckdb.execute(export_query)
|
|
171
|
+
|
|
172
|
+
class CSVWriterPolarsEngine(CSVProperties):
    """Convert a CSV file to other supported file types using Polars.

    Every ``write_*`` method reads the CSV file into a Polars dataframe and
    writes that dataframe to the requested format.
    """

    def _set_out_filename(self, default_filename, out_filename=None):
        """Return ``out_filename`` when it is truthy, else ``default_filename``."""
        return out_filename if out_filename else default_filename

    def write_csv(self, out_filename=None):
        """Export the CSV data to a CSV file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.CSV_OUT_FILENAME, out_filename)
        frame = CSVReaderPolarsEngine(self.filepath).to_dataframe()
        frame.write_csv(target)

    def write_excel(self, out_filename=None):
        """Export the CSV data to an Excel file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.EXCEL_OUT_FILENAME, out_filename)
        frame = CSVReaderPolarsEngine(self.filepath).to_dataframe()
        frame.write_excel(target)

    def write_json(self, out_filename=None):
        """Export the CSV data to a JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.JSON_OUT_FILENAME, out_filename)
        frame = CSVReaderPolarsEngine(self.filepath).to_dataframe()
        frame.write_json(target)

    def write_json_newline_delimited(self, out_filename=None):
        """Export the CSV data to a newline delimited JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.JSON_NEWLINE_OUT_FILENAME, out_filename)
        frame = CSVReaderPolarsEngine(self.filepath).to_dataframe()
        frame.write_ndjson(target)

    def write_parquet(self, out_filename=None):
        """Export the CSV data to a Parquet file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.PARQUET_OUT_FILENAME, out_filename)
        frame = CSVReaderPolarsEngine(self.filepath).to_dataframe()
        frame.write_parquet(target)
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Module for deriving and evaluating file properties."""
|
|
2
|
+
|
|
3
|
+
# standard library
|
|
4
|
+
from collections import Counter
|
|
5
|
+
import csv
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
class FileProperties:
    """Describe a file on disk: name, extension, size and format family.

    The format checks are based purely on the (case-insensitive) file
    extension; the file content is never inspected.
    """

    FILE_SIZE_DIVISOR = 1024
    DEFAULT_ENCODING = 'utf-8'

    # 'xls' appears twice; the duplicate has no effect on membership tests.
    EXCEL_FILE_EXTENSIONS = ['xlsx', 'xlsm', 'xlsb', 'xltx', 'xltm', 'xls', 'xlt', 'xls']
    CSV_FILE_EXTENSIONS = ['csv']
    TAB_SEPARATED_FILES = ['tsv']

    # The combined extension lists below are de-duplicated and sorted.
    TABULAR_FILES = sorted(
        set(CSV_FILE_EXTENSIONS + EXCEL_FILE_EXTENSIONS + TAB_SEPARATED_FILES)
    )

    APACHE_FILE_EXTENSIONS = ['parquet', 'avro']

    STRUCTURED_FILE_EXTENSIONS = sorted(
        set(
            CSV_FILE_EXTENSIONS
            + EXCEL_FILE_EXTENSIONS
            + TABULAR_FILES
            + TAB_SEPARATED_FILES
            + APACHE_FILE_EXTENSIONS
        )
    )

    SEMI_STRUCTURED_FILE_EXTENSIONS = ['json', 'jsonl']

    STANDARD_FILE_EXTENSIONS = sorted(
        set(
            CSV_FILE_EXTENSIONS
            + TAB_SEPARATED_FILES
            + SEMI_STRUCTURED_FILE_EXTENSIONS
            + APACHE_FILE_EXTENSIONS
        )
    )

    PROPRIETARY_FILE_EXTENSIONS = EXCEL_FILE_EXTENSIONS

    EXCEL_ROW_LIMIT = 1_048_576

    # Default output filenames used when a writer is given no filename.
    JSON_OUT_FILENAME = 'output.json'
    JSON_NEWLINE_OUT_FILENAME = 'output.jsonl'
    CSV_OUT_FILENAME = 'output.csv'
    EXCEL_OUT_FILENAME = 'output.xlsx'
    PARQUET_OUT_FILENAME = 'output.parquet'
    AVRO_OUT_FILENAME = 'output.avro'

    def __init__(self, filepath):
        """
        Initialize the FileProperties class.

        Each size attribute is derived from the previous, already rounded
        one (bytes -> KB -> MB -> GB -> TB), rounded to 5 decimal places.

        Args:
            filepath (str): Path to the file.
        """
        path = Path(filepath)
        divisor = self.FILE_SIZE_DIVISOR

        self.filepath = filepath
        self.filename = path.name
        self.extension = path.suffix
        self.extension_string = self.extension.replace('.', '')

        self.size_in_bytes = os.path.getsize(filepath)
        self.size_in_kb = round(self.size_in_bytes / divisor, 5)
        self.size_in_mb = round(self.size_in_kb / divisor, 5)
        self.size_in_gb = round(self.size_in_mb / divisor, 5)
        self.size_in_tb = round(self.size_in_gb / divisor, 5)

    def _extension_in(self, extensions):
        """Return True when the lower-cased extension is in ``extensions``."""
        return self.extension_string.lower() in extensions

    @property
    def is_structured(self):
        """Check if the file is structured."""
        return self._extension_in(self.STRUCTURED_FILE_EXTENSIONS)

    @property
    def is_semi_structured(self):
        """Check if the file is semi-structured."""
        return self._extension_in(self.SEMI_STRUCTURED_FILE_EXTENSIONS)

    @property
    def is_unstructured(self):
        """Check if the file is neither structured nor semi-structured."""
        extension = self.extension_string.lower()
        return not (
            extension in self.STRUCTURED_FILE_EXTENSIONS
            or extension in self.SEMI_STRUCTURED_FILE_EXTENSIONS
        )

    @property
    def is_standard(self):
        """Check if the file is standard."""
        return self._extension_in(self.STANDARD_FILE_EXTENSIONS)

    @property
    def is_proprietary(self):
        """Check if the file is proprietary."""
        return self._extension_in(self.PROPRIETARY_FILE_EXTENSIONS)

    @property
    def is_csv(self):
        """Check if the file is a CSV file."""
        return self._extension_in(self.CSV_FILE_EXTENSIONS)

    @property
    def is_excel(self):
        """Check if the file is an Excel file."""
        return self._extension_in(self.EXCEL_FILE_EXTENSIONS)

    @property
    def is_apache(self):
        """Check if the file is an Apache formatted file."""
        return self._extension_in(self.APACHE_FILE_EXTENSIONS)

    @property
    def is_empty(self):
        """Check if the file is empty (zero bytes)."""
        return self.size_in_bytes == 0

    @property
    def is_large(self):
        """Check if the file is greater than or equal to 1 GB."""
        return self.size_in_gb >= 1.0

    @property
    def is_tabular(self):
        """Check if the file is tabular."""
        return self._extension_in(self.TABULAR_FILES)
|
|
142
|
+
|
|
143
|
+
class CSVProperties(FileProperties):
    """Derive CSV-specific properties of a file, mostly its delimiter.

    The delimiter is inferred from the first line of the file: it is the most
    frequent character that is not alphanumeric, an underscore, a space, a
    double quote or a hyphen.
    """

    DELIMITER_REGEX_PATTERN = r'[^0-9a-zA-Z_ "-]'
    DEFAULT_DELIMITER = ','
    DEFAULT_SAMPLE_ROWS = 1
    CSV_SNIFF_SAMPLE_ROWS = 5
    DATAFRAME_SAMPLE_ROWS = 20

    # Keyed by the csv module's own constants so each label matches the
    # quoting mode the sniffed dialect actually reports.
    QUOTING_MAP = {
        csv.QUOTE_MINIMAL: 'quote minimal',
        csv.QUOTE_ALL: 'quote all',
        csv.QUOTE_NONNUMERIC: 'quote non-numeric',
        csv.QUOTE_NONE: 'no quoting'
    }

    def __init__(self, filepath):
        """
        Initialize the CSVProperties class.

        Args:
            filepath (str): Path to the file to read.

        Raises:
            ValueError: If the file extension is not a CSV extension.
        """
        super().__init__(filepath)
        # Reject non-CSV files before reading them, so the caller gets this
        # message instead of an error from decoding unrelated content.
        if not self.is_csv:
            raise ValueError(
                f"File extension '{self.extension_string}' is not a valid CSV file extension."
            )
        self.first_row = self._get_first_row_from_file()
        self.delimiter = self._infer_csv_file_delimiter()

    def _get_first_row_from_file(self):
        """Read and return the first line of the file.

        Returns:
            str: The first line of the file, stripped of leading/trailing
            whitespace, or an empty string if the file is empty.
        """
        with open(self.filepath, 'r', encoding=self.DEFAULT_ENCODING) as csv_file:
            first_line = csv_file.readline().strip()
        return first_line

    def _get_most_common_non_alpha_numeric_character_from_string(self):
        """Count the delimiter candidates found in the first row.

        Returns:
            list: ``(character, count)`` tuples ordered from most to least
            common; empty when the first row has no candidate character.
        """
        columns_no_spaces = self.first_row.replace(' ', '')
        regex = re.compile(self.DELIMITER_REGEX_PATTERN)
        counts = Counter(regex.findall(columns_no_spaces))
        return counts.most_common()

    def _infer_csv_file_delimiter(self):
        """Infer the delimiter of the CSV file from its first row.

        Returns:
            str: ``DEFAULT_DELIMITER`` for an empty file, a single space when
            the first row has no candidate character, otherwise the most
            common candidate character.
        """
        if self.is_empty:
            return self.DEFAULT_DELIMITER
        delimiter_candidates = self._get_most_common_non_alpha_numeric_character_from_string()
        if not delimiter_candidates:
            return ' '
        return delimiter_candidates[0][0]

    def _sniff_dialect(self):
        """Detect the CSV dialect from the first ``CSV_SNIFF_SAMPLE_ROWS`` lines.

        Returns:
            The dialect found by ``csv.Sniffer``, or ``csv.excel`` when the
            sniffer cannot determine one (e.g. empty or single-column files).
        """
        with open(self.filepath, 'r', encoding=self.DEFAULT_ENCODING) as csvfile:
            # readline() returns '' at end of file, so short files are fine.
            sample = ''.join(csvfile.readline() for _ in range(self.CSV_SNIFF_SAMPLE_ROWS))
        try:
            return csv.Sniffer().sniff(sample)
        except csv.Error:
            # Fall back to the csv module's default dialect so the column
            # properties stay usable when sniffing is inconclusive.
            return csv.excel

    def _get_attributes(self):
        """Generate a dictionary of CSV attributes.

        Column names come from splitting the first row on the inferred
        delimiter; quoting inside the header row is not interpreted.
        """
        columns_list = self.first_row.split(self.delimiter)
        columns = {c: 'VARCHAR' for c in columns_list}
        dialect = self._sniff_dialect()
        columns_string = ", ".join(columns_list)

        attributes = {
            'delimiter': self.delimiter,
            'quotechar': dialect.quotechar,
            'escapechar': dialect.escapechar,
            'doublequote': dialect.doublequote,
            'newline_delimiter': dialect.lineterminator,
            'skipinitialspace': dialect.skipinitialspace,
            'quoting': self.QUOTING_MAP.get(dialect.quoting),
            'row_count_with_header': self.row_count_with_header,
            'row_count_without_header': self.row_count_without_header,
            'columns_schema': columns,
            'columns_original_format': self.first_row,
            'columns_list': columns_list,
            'columns_string': columns_string,
            'columns_byte_string': columns_string.encode(),
            'column_count': len(columns_list)
        }

        return attributes

    @property
    def row_count_with_header(self):
        """Return the number of lines in the CSV file including the header.

        The count is computed once and cached on the instance, so the cache
        is released together with the instance.
        """
        cached = getattr(self, '_row_count_with_header', None)
        if cached is None:
            with open(self.filepath, 'rb') as csv_file:
                cached = sum(1 for _ in csv_file)
            self._row_count_with_header = cached
        return cached

    @property
    def row_count_without_header(self):
        """Return the number of lines excluding the header (never negative)."""
        return max(self.row_count_with_header - 1, 0)

    @property
    def columns(self):
        """Return the list of column names from the first row."""
        return self._get_attributes()['columns_list']

    @property
    def columns_string(self):
        """Return the column names joined with ', ' as a string."""
        return self._get_attributes()['columns_string']

    @property
    def columns_byte_string(self):
        """Return the column names joined with ', ' as bytes."""
        return self._get_attributes()['columns_byte_string']

    @property
    def column_count(self):
        """Return the number of columns in the CSV file."""
        return self._get_attributes()['column_count']

    @property
    def quotechar(self):
        """Return the quote character used in the CSV file."""
        return self._get_attributes()['quotechar']

    @property
    def escapechar(self):
        """Return the escape character used in the CSV file."""
        return self._get_attributes()['escapechar']

    @property
    def newline_delimiter(self):
        """Return the newline delimiter reported by the detected dialect."""
        return self._get_attributes()['newline_delimiter']
|
datagrunt/core/logger.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Module for showing logging messages."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
LARGE_FILE_WARNING = "File is large and may load into memory slowly or exceed memory capacity."
|
|
6
|
+
DUCKDB_ENGINE_ERROR = """DuckDB engine failed due to the following error: {error}. \
|
|
7
|
+
Switching to Polars."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def show_warning(message):
    """Log a message at WARNING level through the root logger.

    ``logging.basicConfig`` is called first with a ``LEVEL - message`` format.

    Args:
        message (str): The message to show.

    Returns:
        The return value of ``logging.warning``.
    """
    log_format = '%(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=logging.WARNING)
    outcome = logging.warning(message)
    return outcome
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def show_info_message(message):
    """Log a message at INFO level through the root logger.

    Args:
        message (str): The message to show.

    Returns:
        The return value of ``logging.info``.
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
    # basicConfig does nothing once the root logger already has handlers
    # (for example after show_warning configured it at WARNING level), which
    # would silently drop this message. Lower the threshold when needed.
    root_logger = logging.getLogger()
    if root_logger.getEffectiveLevel() > logging.INFO:
        root_logger.setLevel(logging.INFO)
    return logging.info(message)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def show_large_file_warning():
    """Emit the predefined large-file warning message."""
    show_warning(LARGE_FILE_WARNING)
    return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def duckdb_query_error(error_message):
    """Warn that a DuckDB query failed, embedding the given error text.

    Args:
        error_message: The error to insert into the warning message.

    Returns:
        The return value of ``show_warning``.
    """
    return show_warning(DUCKDB_ENGINE_ERROR.format(error=error_message))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def show_dataframe_sample(dataframe):
    """Show a dataframe by logging it as an info message.

    Args:
        dataframe (dataframe): The dataframe to show.

    Returns:
        The return value of ``show_info_message``.
    """
    outcome = show_info_message(dataframe)
    return outcome
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Module to store database queries and query strings."""
|
|
2
|
+
|
|
3
|
+
# standard library
|
|
4
|
+
|
|
5
|
+
# third party libraries
|
|
6
|
+
|
|
7
|
+
# local libraries
|
|
8
|
+
from .databases import DuckDBDatabase
|
|
9
|
+
from .fileproperties import FileProperties
|
|
10
|
+
|
|
11
|
+
class DuckDBQueries(DuckDBDatabase):
    """Class to store DuckDB database queries and query strings.

    Every method returns a SQL string; nothing is executed here. File paths
    and the delimiter are embedded as SQL string literals, so single quotes
    in them are escaped before interpolation.
    """

    def __init__(self, filepath):
        """
        Initialize the DuckDBQueries class.

        Args:
            filepath (str): Path to the file.
        """
        super().__init__(filepath)
        self.export_properties = FileProperties(self.filepath)

    @staticmethod
    def _escape_sql_literal(value):
        """Return ``value`` as text that is safe inside a single-quoted SQL literal."""
        # A single quote inside a SQL string literal is written as two quotes.
        return str(value).replace("'", "''")

    def _set_out_filename(self, default_filename, out_filename=None):
        """Evaluate if a filename is passed in and if not, return default filename."""
        return out_filename if out_filename else default_filename

    def import_csv_query(self, delimiter):
        """Query to import a CSV file into a DuckDB table.

        Args:
            delimiter (str): The delimiter to use.

        Returns:
            str: A ``CREATE OR REPLACE TABLE ... AS SELECT`` statement reading
            ``self.filepath`` with every column imported as VARCHAR.
        """
        filepath = self._escape_sql_literal(self.filepath)
        delim = self._escape_sql_literal(delimiter)
        return f"""
            CREATE OR REPLACE TABLE {self.database_table_name} AS
            SELECT *
            FROM read_csv('{filepath}',
                          auto_detect=true,
                          delim='{delim}',
                          header=true,
                          null_padding=true,
                          all_varchar=true);
        """

    def select_from_duckdb_table(self):
        """Query to select from a DuckDB table."""
        return f"SELECT * FROM {self.database_table_name}"

    def export_csv_query(self, out_filename=None):
        """Query to export a DuckDB table to a CSV file.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_literal(
            self._set_out_filename(self.export_properties.CSV_OUT_FILENAME, out_filename)
        )
        return f"COPY {self.database_table_name} TO '{filename}' (HEADER, DELIMITER ',');"

    def export_excel_query(self, out_filename=None):
        """Query to export a DuckDB table to an Excel file.

        The statement installs and loads the ``spatial`` extension and writes
        through its GDAL ``xlsx`` driver.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_literal(
            self._set_out_filename(self.export_properties.EXCEL_OUT_FILENAME, out_filename)
        )
        return f"""
            INSTALL spatial;
            LOAD spatial;
            COPY (SELECT * FROM {self.database_table_name})
            TO '{filename}'(FORMAT GDAL, DRIVER 'xlsx')
        """

    def export_json_query(self, out_filename=None):
        """Query to export a DuckDB table to a JSON file (a JSON array).

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_literal(
            self._set_out_filename(self.export_properties.JSON_OUT_FILENAME, out_filename)
        )
        # FORMAT is stated explicitly so the output does not depend on the
        # extension of a caller-supplied filename.
        return (
            f"COPY (SELECT * FROM {self.database_table_name}) "
            f"TO '{filename}' (FORMAT JSON, ARRAY true)"
        )

    def export_json_newline_delimited_query(self, out_filename=None):
        """Query to export a DuckDB table to a newline delimited JSON file.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_literal(
            self._set_out_filename(self.export_properties.JSON_NEWLINE_OUT_FILENAME,
                                   out_filename)
        )
        # FORMAT is stated explicitly so the output does not depend on the
        # extension of a caller-supplied filename.
        return (
            f"COPY (SELECT * FROM {self.database_table_name}) "
            f"TO '{filename}' (FORMAT JSON, ARRAY false)"
        )

    def export_parquet_query(self, out_filename=None):
        """Query to export a DuckDB table to a Parquet file.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_literal(
            self._set_out_filename(self.export_properties.PARQUET_OUT_FILENAME, out_filename)
        )
        return f"COPY (SELECT * FROM {self.database_table_name}) TO '{filename}'(FORMAT PARQUET)"
|
datagrunt/csvfile.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Module for reading CSV files and converting CSV files to different standard file formats."""
|
|
2
|
+
|
|
3
|
+
# standard library
|
|
4
|
+
|
|
5
|
+
# third party libraries
|
|
6
|
+
import duckdb
|
|
7
|
+
|
|
8
|
+
# local libraries
|
|
9
|
+
from .core.fileproperties import CSVProperties
|
|
10
|
+
from .core.engines import CSVReaderDuckDBEngine, CSVReaderPolarsEngine
|
|
11
|
+
from .core.engines import CSVWriterDuckDBEngine, CSVWriterPolarsEngine
|
|
12
|
+
from .core.queries import DuckDBQueries
|
|
13
|
+
|
|
14
|
+
class CSVReader(CSVProperties):
    """Class to unify the interface for reading CSV files.

    Each public method delegates to a reader engine (DuckDB or Polars) that
    is created for the call.
    """

    READER_ENGINES = ['duckdb', 'polars']
    VALUE_ERROR_MESSAGE = """Reader engine '{engine}' is not 'duckdb' or 'polars'. Pass either 'duckdb' or 'polars' as valid engine params."""

    def __init__(self, filepath, engine='polars'):
        """Initialize the CSV Reader class.

        Args:
            filepath (str): Path to the file to read.
            engine (str, default 'polars'): Determines which reader engine class to instantiate.
                Case and spaces are ignored.

        Raises:
            ValueError: If ``engine`` is not 'duckdb' or 'polars'.
        """
        super().__init__(filepath)
        if not isinstance(engine, str):
            raise ValueError(self.VALUE_ERROR_MESSAGE.format(engine=engine))
        self.engine = engine.lower().replace(' ', '')
        # Validate before touching DuckDB so that an invalid engine does not
        # open a database for nothing.
        if self.engine not in self.READER_ENGINES:
            raise ValueError(self.VALUE_ERROR_MESSAGE.format(engine=self.engine))
        self.db_table = DuckDBQueries(self.filepath).database_table_name

    def _set_reader_engine(self):
        """Return a new reader engine instance for the configured engine.

        Polars is used when the engine is 'polars'; otherwise DuckDB.
        """
        if self.engine != 'polars':
            engine = CSVReaderDuckDBEngine(self.filepath)
        else:
            engine = CSVReaderPolarsEngine(self.filepath)
        return engine

    def get_sample(self):
        """Show a sample of the CSV file using the configured engine."""
        self._set_reader_engine().get_sample()

    def to_dataframe(self):
        """Converts CSV to a Polars dataframe.

        Returns:
            A Polars dataframe.
        """
        return self._set_reader_engine().to_dataframe()

    def to_arrow_table(self):
        """Converts CSV to a PyArrow table.

        Returns:
            A PyArrow table.
        """
        return self._set_reader_engine().to_arrow_table()

    def to_dicts(self):
        """Converts CSV to a list of Python dictionaries.

        Returns:
            A list of dictionaries.
        """
        return self._set_reader_engine().to_dicts()

    def query_data(self, sql_query):
        """Queries a CSV file after importing it into DuckDB.

        The file is imported into the table named ``self.db_table`` before
        ``sql_query`` is run, regardless of the configured reader engine.

        Args:
            sql_query (str): Query to run against DuckDB.

        Returns:
            A DuckDB DuckDBPyRelation with the query results.

        Example:
            dg = CSVReader('myfile.csv')
            query = f"SELECT col1, col2 FROM {dg.db_table}"
            dg.query_data(query)
        """
        queries = DuckDBQueries(self.filepath)
        duckdb.sql(queries.import_csv_query(self.delimiter))
        return duckdb.sql(sql_query)
|
|
88
|
+
|
|
89
|
+
class CSVWriter(CSVProperties):
    """Single entry point for converting a CSV file to other supported file types.

    Attributes:
        db_table: Table name taken from
            ``DuckDBQueries(filepath).database_table_name``.
        engine: Normalized (lower-cased, spaces removed) writer engine name.
    """

    WRITER_ENGINES = ['duckdb', 'polars']
    VALUE_ERROR_MESSAGE = """Writer engine '{engine}' is not 'duckdb' or 'polars'. Pass either 'duckdb' or 'polars' as valid engine params."""

    def __init__(self, filepath, engine='duckdb'):
        """Set up the writer for a CSV file and validate the engine name.

        Args:
            filepath (str): Path to the CSV file to convert.
            engine (str, default 'duckdb'): Name of the writer engine to use.

        Raises:
            ValueError: If the normalized engine name is not in ``WRITER_ENGINES``.
        """
        super().__init__(filepath)
        self.db_table = DuckDBQueries(self.filepath).database_table_name
        # Case and embedded spaces are ignored when matching the engine name.
        self.engine = engine.lower().replace(' ', '')
        if self.engine in self.WRITER_ENGINES:
            return
        raise ValueError(self.VALUE_ERROR_MESSAGE.format(engine=self.engine))

    def _set_writer_engine(self):
        """Build the writer engine object that matches ``self.engine``.

        Returns:
            A ``CSVWriterPolarsEngine`` when the engine is 'polars', otherwise
            a ``CSVWriterDuckDBEngine``. A new engine object is built on every call.
        """
        if self.engine == 'polars':
            return CSVWriterPolarsEngine(self.filepath)
        return CSVWriterDuckDBEngine(self.filepath)

    def write_csv(self, out_filename=None):
        """Delegate CSV export to the selected writer engine.

        Args:
            out_filename (str, optional): The name of the output file.

        Returns:
            Whatever the engine's ``write_csv`` returns.
        """
        writer = self._set_writer_engine()
        return writer.write_csv(out_filename)

    def write_excel(self, out_filename=None):
        """Delegate Excel export to the selected writer engine.

        Args:
            out_filename (str, optional): The name of the output file.

        Returns:
            Whatever the engine's ``write_excel`` returns.
        """
        writer = self._set_writer_engine()
        return writer.write_excel(out_filename)

    def write_json(self, out_filename=None):
        """Delegate JSON export to the selected writer engine.

        Args:
            out_filename (str, optional): The name of the output file.

        Returns:
            Whatever the engine's ``write_json`` returns.
        """
        writer = self._set_writer_engine()
        return writer.write_json(out_filename)

    def write_json_newline_delimited(self, out_filename=None):
        """Delegate newline-delimited JSON export to the selected writer engine.

        Args:
            out_filename (str, optional): The name of the output file.

        Returns:
            Whatever the engine's ``write_json_newline_delimited`` returns.
        """
        writer = self._set_writer_engine()
        return writer.write_json_newline_delimited(out_filename)

    def write_parquet(self, out_filename=None):
        """Delegate Parquet export to the selected writer engine.

        Args:
            out_filename (str, optional): The name of the output file.

        Returns:
            Whatever the engine's ``write_parquet`` returns.
        """
        writer = self._set_writer_engine()
        return writer.write_parquet(out_filename)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Martin Graham
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datagrunt
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Read CSV files and convert to other file formats easily
|
|
5
|
+
Author-email: Martin Graham <datagrunt@datagrunt.io>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://pmgraham.github.io/datagrunt-docs
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/pmgraham/datagrunt/issues
|
|
9
|
+
Project-URL: Documentation, https://pmgraham.github.io/datagrunt-docs
|
|
10
|
+
Project-URL: Source Code, https://github.com/pmgraham/datagrunt
|
|
11
|
+
Keywords: csv,data,duckdb,polars,pyarrow,xlsx,delimiter
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: duckdb>=1.1.0
|
|
23
|
+
Requires-Dist: polars>=1.7.1
|
|
24
|
+
Requires-Dist: pyarrow>=17.0.0
|
|
25
|
+
Requires-Dist: XlsxWriter>=3.2.0
|
|
26
|
+
Provides-Extra: build
|
|
27
|
+
Requires-Dist: build; extra == "build"
|
|
28
|
+
Requires-Dist: twine; extra == "build"
|
|
29
|
+
Requires-Dist: bumpver; extra == "build"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=3.0; extra == "dev"
|
|
33
|
+
Requires-Dist: black; extra == "dev"
|
|
34
|
+
Requires-Dist: isort; extra == "dev"
|
|
35
|
+
Requires-Dist: flake8; extra == "dev"
|
|
36
|
+
|
|
37
|
+
# Welcome To Datagrunt
|
|
38
|
+
|
|
39
|
+
Datagrunt is a Python library designed to simplify the way you work with CSV files. It provides a streamlined approach to reading, processing, and transforming your data into various formats, making data manipulation efficient and intuitive.
|
|
40
|
+
|
|
41
|
+
## Why Datagrunt?
|
|
42
|
+
|
|
43
|
+
Born out of real-world frustration, Datagrunt eliminates the need for repetitive coding when handling CSV files. Whether you're a data analyst, data engineer, or data scientist, Datagrunt empowers you to focus on insights, not tedious data wrangling.
|
|
44
|
+
|
|
45
|
+
## Key Features
|
|
46
|
+
|
|
47
|
+
- **Intelligent Delimiter Inference:** Datagrunt automatically detects and applies the correct delimiter for your CSV files.
|
|
48
|
+
- **Seamless Data Processing:** Leverage the robust capabilities of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) to perform advanced data processing tasks directly on your CSV data.
|
|
49
|
+
- **Flexible Transformation:** Easily convert your processed CSV data into various formats to suit your needs.
|
|
50
|
+
- **Pythonic API:** Enjoy a clean and intuitive API that integrates seamlessly into your existing Python workflows.
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
Get started with Datagrunt in seconds using pip:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install datagrunt
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Getting Started
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from datagrunt import CSVReader
|
|
64
|
+
|
|
65
|
+
# Load your CSV file
|
|
66
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
67
|
+
engine = 'duckdb'
|
|
68
|
+
|
|
69
|
+
# Set duckdb as the processing engine. Engine set to 'polars' by default
|
|
70
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
71
|
+
|
|
72
|
+
# return sample of the data to get a peek at the schema
|
|
73
|
+
dg.get_sample()
|
|
74
|
+
┌────────────┬───────────┬──────────────┬───┬──────────────────────┬──────────────────────┬───────────────────┐
|
|
75
|
+
│ VIN (1-10) │ County │ City │ … │ Vehicle Location │ Electric Utility │ 2020 Census Tract │
|
|
76
|
+
│ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │
|
|
77
|
+
├────────────┼───────────┼──────────────┼───┼──────────────────────┼──────────────────────┼───────────────────┤
|
|
78
|
+
│ 5YJSA1E28K │ Snohomish │ Mukilteo │ … │ POINT (-122.29943 … │ PUGET SOUND ENERGY… │ 53061042001 │
|
|
79
|
+
│ 1C4JJXP68P │ Yakima │ Yakima │ … │ POINT (-120.468875… │ PACIFICORP │ 53077001601 │
|
|
80
|
+
│ WBY8P6C05L │ Kitsap │ Kingston │ … │ POINT (-122.517835… │ PUGET SOUND ENERGY… │ 53035090102 │
|
|
81
|
+
│ JTDKARFP1J │ Kitsap │ Port Orchard │ … │ POINT (-122.653005… │ PUGET SOUND ENERGY… │ 53035092802 │
|
|
82
|
+
│ 5UXTA6C09N │ Snohomish │ Everett │ … │ POINT (-122.203234… │ PUGET SOUND ENERGY… │ 53061041605 │
|
|
83
|
+
│ 5YJYGDEF8L │ King │ Seattle │ … │ POINT (-122.378886… │ CITY OF SEATTLE - … │ 53033004703 │
|
|
84
|
+
│ JTMAB3FV7P │ Thurston │ Rainier │ … │ POINT (-122.677141… │ PUGET SOUND ENERGY… │ 53067012530 │
|
|
85
|
+
│ JN1AZ0CPXC │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022402 │
|
|
86
|
+
│ JN1AZ0CP7B │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022603 │
|
|
87
|
+
│ 1N4AZ0CP0F │ Thurston │ Olympia │ … │ POINT (-122.86491 … │ PUGET SOUND ENERGY… │ 53067010300 │
|
|
88
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
89
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
90
|
+
│ · │ · │ · │ · │ · │ · │ · │
|
|
91
|
+
│ 5YJYGDEE7M │ Clark │ Vancouver │ … │ POINT (-122.515805… │ BONNEVILLE POWER A… │ 53011041310 │
|
|
92
|
+
│ 7SAYGAEE0P │ Snohomish │ Monroe │ … │ POINT (-121.968385… │ PUGET SOUND ENERGY… │ 53061052203 │
|
|
93
|
+
│ 2C4RC1N75P │ King │ Burien │ … │ POINT (-122.347227… │ CITY OF SEATTLE - … │ 53033027600 │
|
|
94
|
+
│ 1FTVW1EVXP │ King │ Kirkland │ … │ POINT (-122.202653… │ PUGET SOUND ENERGY… │ 53033022300 │
|
|
95
|
+
│ 4JGGM1CB2P │ King │ Seattle │ … │ POINT (-122.2453 4… │ CITY OF SEATTLE - … │ 53033011700 │
|
|
96
|
+
│ 1N4BZ0CP0G │ King │ Seattle │ … │ POINT (-122.334079… │ CITY OF SEATTLE - … │ 53033008300 │
|
|
97
|
+
│ 7SAYGDEF2N │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024704 │
|
|
98
|
+
│ 1N4BZ1DP7L │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024902 │
|
|
99
|
+
...
|
|
100
|
+
├────────────┴───────────┴──────────────┴───┴──────────────────────┴──────────────────────┴───────────────────┤
|
|
101
|
+
│ ? rows (>9999 rows, 20 shown) 17 columns (6 shown) │
|
|
102
|
+
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## DuckDB Integration for Performant SQL Queries
|
|
106
|
+
```python
|
|
107
|
+
from datagrunt import CSVReader
|
|
108
|
+
|
|
109
|
+
csv_file = 'electric_vehicle_population_data.csv'
|
|
110
|
+
engine = 'duckdb'
|
|
111
|
+
|
|
112
|
+
dg = CSVReader(csv_file, engine=engine)
|
|
113
|
+
|
|
114
|
+
# Construct your SQL query
|
|
115
|
+
query = f"""
|
|
116
|
+
WITH core AS (
|
|
117
|
+
SELECT
|
|
118
|
+
City AS city,
|
|
119
|
+
"VIN (1-10)" AS vin
|
|
120
|
+
FROM {dg.db_table}
|
|
121
|
+
)
|
|
122
|
+
SELECT
|
|
123
|
+
city,
|
|
124
|
+
COUNT(vin) AS vehicle_count
|
|
125
|
+
FROM core
|
|
126
|
+
GROUP BY 1
|
|
127
|
+
ORDER BY 2 DESC
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
# Execute the query and get results as a Polars DataFrame
|
|
131
|
+
df = dg.query_data(query).pl()
|
|
132
|
+
print(df)
|
|
133
|
+
┌────────────────┬───────────────┐
|
|
134
|
+
│ city ┆ vehicle_count │
|
|
135
|
+
│ --- ┆ --- │
|
|
136
|
+
│ str ┆ i64 │
|
|
137
|
+
╞════════════════╪═══════════════╡
|
|
138
|
+
│ Seattle ┆ 32602 │
|
|
139
|
+
│ Bellevue ┆ 9960 │
|
|
140
|
+
│ Redmond ┆ 7165 │
|
|
141
|
+
│ Vancouver ┆ 7081 │
|
|
142
|
+
│ Bothell ┆ 6602 │
|
|
143
|
+
│ … ┆ … │
|
|
144
|
+
│ Glenwood ┆ 1 │
|
|
145
|
+
│ Walla Walla Co ┆ 1 │
|
|
146
|
+
│ Pittsburg ┆ 1 │
|
|
147
|
+
│ Decatur ┆ 1 │
|
|
148
|
+
│ Redwood City ┆ 1 │
|
|
149
|
+
└────────────────┴───────────────┘
|
|
150
|
+
```
|
|
151
|
+
## License
|
|
152
|
+
This project is licensed under the [MIT License](https://opensource.org/license/mit).
|
|
153
|
+
|
|
154
|
+
## Acknowledgements
|
|
155
|
+
A HUGE thank you to the open source community and the creators of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) for their fantastic libraries that power Datagrunt.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
datagrunt/__init__.py,sha256=rWgU2R8hvXqr9hw_DH_7j74PI2gnXlHCybQSpwDiZfg,1122
|
|
2
|
+
datagrunt/csvfile.py,sha256=jbIvA9_0ucRJ4qCu6yQ1InSg0ENxN2dkCZcX8rVN4iI,5476
|
|
3
|
+
datagrunt/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
datagrunt/core/databases.py,sha256=yo8ILK9dtj8zusrM8HgoA5s629b-ghGRp2HumvJS4eE,1702
|
|
5
|
+
datagrunt/core/engines.py,sha256=K__2kkdKwvxeL6QwmyEyZdeuUT5MDPpY0lzKwYHrouM,7543
|
|
6
|
+
datagrunt/core/fileproperties.py,sha256=e5mP7bAsQzKpppryZqEV425AXNXsMGZnjrPhYz8gaJI,10198
|
|
7
|
+
datagrunt/core/logger.py,sha256=ck5HmwHmk0SUmTjQudqLgpQk5mpC9T0UOlDKqi6qssI,1250
|
|
8
|
+
datagrunt/core/queries.py,sha256=2OPD2Zz1Toryr5_nVANkJvifwg6MzCx4obah34m-IBY,3846
|
|
9
|
+
datagrunt-0.0.0.dist-info/LICENSE,sha256=qY7AmIOAwE05aVD7siEo8-2HOUz5WYUEhxbX0Ts9fyg,1069
|
|
10
|
+
datagrunt-0.0.0.dist-info/METADATA,sha256=K64t3rDDe8IiHUrJrcrweVYPJSa3COMhC0jWvZ01TpQ,9250
|
|
11
|
+
datagrunt-0.0.0.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
|
|
12
|
+
datagrunt-0.0.0.dist-info/top_level.txt,sha256=kVGx0P9sIaw_RegsAR94f7fyE4Um4ayE5vf_aTImMoE,10
|
|
13
|
+
datagrunt-0.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datagrunt
|