datagrunt 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datagrunt/__init__.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ Datagrunt
3
+
4
+ A Python library designed to simplify the way you work with CSV files.
5
+
6
+ This module provides inferred CSV delimiters and helper methods for reading and writing CSV files.
7
+
8
+ Example:
9
+ A simple example of how to use the main functionality of this package:
10
+
11
+ from datagrunt.csvfile import CSVReader
12
+
13
+ csv_file = 'electric_vehicle_population_data.csv'
14
+ engine = 'duckdb'
15
+
16
+ dg = CSVReader(csv_file, engine=engine)
17
+
18
+ dg.get_sample()
19
+
20
+ Attributes:
21
+ __version__: A string representing the version of this module.
22
+ __author__: The name of the package author.
23
+ __license__: The license under which the package is released.
24
+ """
25
+
26
+ __version__ = "0.0.0"
27
+ __author__ = "Martin Graham"
28
+ __license__ = "MIT"
29
+
30
+ # Import key classes, functions, or submodules that should be available at the package level
31
+ from .csvfile import CSVReader, CSVWriter
32
+
33
+ # You can define __all__ to specify what gets imported with "from package import *"
34
+ __all__ = ['CSVReader', 'CSVWriter']
35
+
36
+ # Optionally, you can include a logger for your package
37
+ import logging
38
+
39
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
File without changes
@@ -0,0 +1,53 @@
1
+ """Module for interfacing with databases."""
2
+
3
+ # standard library
4
+ import os
5
+ from pathlib import Path
6
+ import re
7
+
8
+ # third party libraries
9
+ import duckdb
10
+
11
class DuckDBDatabase:
    """Class to configure local database for file processing.

    Utilizes duckdb as the processing engine. The database file and the import
    table are both named after the stem of ``filepath`` with every
    non-alphanumeric character removed.
    """
    DEFAULT_ENCODING = 'utf-8'
    DEFAULT_THREAD_COUNT = 16

    def __init__(self, filepath):
        """
        Initialize the DuckDBDatabase class.

        Args:
            filepath (str): Path to the file.
        """
        self.filepath = filepath
        self.database_filename = self._set_database_filename()
        self.database_table_name = self._set_database_table_name()
        self.database_connection = self._set_database_connection()

    def __del__(self):
        """Close the connection and delete the .db file after use."""
        # __del__ also runs when __init__ raised part-way through, so neither
        # attribute can be assumed to exist.
        connection = getattr(self, 'database_connection', None)
        if connection is not None:
            # Release the file handle before the file is removed.
            connection.close()
        filename = getattr(self, 'database_filename', None)
        if filename:
            try:
                os.remove(filename)
            except FileNotFoundError:
                # Nothing to clean up: the file was never created or another
                # instance sharing the same name already removed it.
                pass

    def _format_filename_string(self):
        """Remove all non alphanumeric characters from filename."""
        return re.sub(r'[^a-zA-Z0-9]', '', Path(self.filepath).stem)

    def _set_database_filename(self):
        """Return name of duckdb file created at runtime."""
        return f'{self._format_filename_string()}.db'

    def _set_database_table_name(self):
        """Return name of duckdb import table created during file import.

        The name is interpolated unquoted into SQL, so it must be a valid
        identifier. A cleaned stem that is empty or starts with a digit is
        prefixed with an underscore.
        """
        name = self._format_filename_string()
        if not name or not name[0].isalpha():
            name = f'_{name}'
        return name

    def _set_database_connection(self, threads=DEFAULT_THREAD_COUNT):
        """Establish a connection with duckdb.

        Args:
            threads (int): Number of threads to use for duckdb.

        Returns:
            The connection returned by ``duckdb.connect`` for the database file.
        """
        return duckdb.connect(self.database_filename, config={'threads': threads})
@@ -0,0 +1,231 @@
1
+ """Module engines to enable data processing."""
2
+
3
+ # standard library
4
+
5
+ # third party libraries
6
+ import duckdb
7
+ import polars as pl
8
+
9
+ # local libraries
10
+ from .fileproperties import CSVProperties
11
+ from .queries import DuckDBQueries
12
+ from .logger import show_large_file_warning, show_dataframe_sample
13
+
14
class CSVReaderDuckDBEngine(CSVProperties):
    """DuckDB-backed reader that turns a CSV file into other in-memory forms."""

    def __init__(self, filepath):
        """
        Initialize the CSVReaderDuckDBEngine class.

        Args:
            filepath (str): Path to the file to read.
        """
        super().__init__(filepath)
        # Only the derived table name is kept from the query helper.
        self.db_table = DuckDBQueries(self.filepath).database_table_name

    def _read_csv(self):
        """Read the CSV with DuckDB using the inferred delimiter.

        Every column is read as VARCHAR and short rows are null-padded.

        Returns:
            A DuckDB DuckDBPyRelation.
        """
        read_options = {
            'delimiter': self.delimiter,
            'null_padding': True,
            'all_varchar': True,
        }
        return duckdb.read_csv(self.filepath, **read_options)

    def get_sample(self):
        """Display a sample of the CSV file via the relation's ``show``."""
        relation = self._read_csv()
        relation.show()

    def to_dataframe(self):
        """Convert the CSV to a Polars dataframe.

        Emits the large-file warning first when the file is flagged as large.

        Returns:
            A Polars dataframe.
        """
        if self.is_large:
            show_large_file_warning()
        relation = self._read_csv()
        return relation.pl()

    def to_arrow_table(self):
        """Convert the CSV to a PyArrow table.

        Returns:
            A PyArrow table.
        """
        return self._read_csv().arrow()

    def to_dicts(self):
        """Convert the CSV to a list of Python dictionaries.

        Returns:
            A list of dictionaries.
        """
        dataframe = self.to_dataframe()
        return dataframe.to_dicts()
70
+
71
class CSVReaderPolarsEngine(CSVProperties):
    """Polars-backed reader that turns a CSV file into other in-memory forms."""

    def get_sample(self):
        """Log the first ``DATAFRAME_SAMPLE_ROWS`` rows of the CSV file."""
        sample = pl.read_csv(
            self.filepath,
            separator=self.delimiter,
            truncate_ragged_lines=True,
            n_rows=self.DATAFRAME_SAMPLE_ROWS,
        )
        show_dataframe_sample(sample)

    def to_dataframe(self):
        """Convert the CSV to a Polars dataframe.

        Emits the large-file warning first when the file is flagged as large.

        Returns:
            A Polars dataframe.
        """
        if self.is_large:
            show_large_file_warning()
        dataframe = pl.read_csv(
            self.filepath,
            separator=self.delimiter,
            truncate_ragged_lines=True,
        )
        return dataframe

    def to_arrow_table(self):
        """Convert the CSV to a PyArrow table.

        Returns:
            A PyArrow table.
        """
        dataframe = self.to_dataframe()
        return dataframe.to_arrow()

    def to_dicts(self):
        """Convert the CSV to a list of Python dictionaries.

        Returns:
            A list of dictionaries.
        """
        dataframe = self.to_dataframe()
        return dataframe.to_dicts()
113
+
114
class CSVWriterDuckDBEngine(CSVProperties):
    """DuckDB-backed writer that converts a CSV file to other file types.

    Every ``write_*`` method first imports the CSV into a DuckDB table and then
    runs the matching export statement built by ``DuckDBQueries``.
    """

    def __init__(self, filepath):
        """
        Initialize the CSVWriterDuckDBEngine class.

        Args:
            filepath (str): Path to the CSV file to convert.
        """
        super().__init__(filepath)
        self.queries = DuckDBQueries(self.filepath)

    def write_csv(self, out_filename=None):
        """Import the CSV into DuckDB and export it as a CSV file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        import_statement = self.queries.import_csv_query(self.delimiter)
        duckdb.sql(import_statement)
        export_statement = self.queries.export_csv_query(out_filename)
        duckdb.sql(export_statement)

    def write_excel(self, out_filename=None):
        """Import the CSV into DuckDB and export it as an Excel file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        import_statement = self.queries.import_csv_query(self.delimiter)
        duckdb.sql(import_statement)
        export_statement = self.queries.export_excel_query(out_filename)
        duckdb.sql(export_statement)

    def write_json(self, out_filename=None):
        """Import the CSV into DuckDB and export it as a JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        import_statement = self.queries.import_csv_query(self.delimiter)
        duckdb.sql(import_statement)
        export_statement = self.queries.export_json_query(out_filename)
        duckdb.sql(export_statement)

    def write_json_newline_delimited(self, out_filename=None):
        """Import the CSV into DuckDB and export it as newline-delimited JSON.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        import_statement = self.queries.import_csv_query(self.delimiter)
        duckdb.sql(import_statement)
        export_statement = self.queries.export_json_newline_delimited_query(out_filename)
        duckdb.sql(export_statement)

    def write_parquet(self, out_filename=None):
        """Import the CSV into DuckDB and export it as a Parquet file.

        Unlike the other writers, this one runs its statements through
        ``duckdb.execute`` rather than ``duckdb.sql``.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        import_statement = self.queries.import_csv_query(self.delimiter)
        duckdb.execute(import_statement)
        export_statement = self.queries.export_parquet_query(out_filename)
        duckdb.execute(export_statement)
171
+
172
class CSVWriterPolarsEngine(CSVProperties):
    """Polars-backed writer that converts a CSV file to other file formats.

    Every ``write_*`` method resolves the output filename, loads the CSV through
    ``CSVReaderPolarsEngine`` and calls the matching Polars writer.
    """

    def _set_out_filename(self, default_filename, out_filename=None):
        """Return ``out_filename`` when it is truthy, else ``default_filename``."""
        return out_filename if out_filename else default_filename

    def write_csv(self, out_filename=None):
        """Export the CSV contents to a CSV file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.CSV_OUT_FILENAME, out_filename)
        reader = CSVReaderPolarsEngine(self.filepath)
        reader.to_dataframe().write_csv(target)

    def write_excel(self, out_filename=None):
        """Export the CSV contents to an Excel file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.EXCEL_OUT_FILENAME, out_filename)
        reader = CSVReaderPolarsEngine(self.filepath)
        reader.to_dataframe().write_excel(target)

    def write_json(self, out_filename=None):
        """Export the CSV contents to a JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.JSON_OUT_FILENAME, out_filename)
        reader = CSVReaderPolarsEngine(self.filepath)
        reader.to_dataframe().write_json(target)

    def write_json_newline_delimited(self, out_filename=None):
        """Export the CSV contents to a newline-delimited JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.JSON_NEWLINE_OUT_FILENAME, out_filename)
        reader = CSVReaderPolarsEngine(self.filepath)
        reader.to_dataframe().write_ndjson(target)

    def write_parquet(self, out_filename=None):
        """Export the CSV contents to a Parquet file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        target = self._set_out_filename(self.PARQUET_OUT_FILENAME, out_filename)
        reader = CSVReaderPolarsEngine(self.filepath)
        reader.to_dataframe().write_parquet(target)
@@ -0,0 +1,296 @@
1
+ """Module for deriving and evaluating file properties."""
2
+
3
+ # standard library
4
+ from collections import Counter
5
+ import csv
6
+ from functools import lru_cache
7
+ import os
8
+ from pathlib import Path
9
+ import re
10
+
11
class FileProperties:
    """Base class for file objects.

    Collects name, extension and size information for a path and classifies the
    file purely by its (case-insensitive) extension.
    """

    FILE_SIZE_DIVISOR = 1024
    DEFAULT_ENCODING = 'utf-8'
    EXCEL_FILE_EXTENSIONS = [
        'xlsx',
        'xlsm',
        'xlsb',
        'xltx',
        'xltm',
        'xls',
        'xlt',
    ]

    CSV_FILE_EXTENSIONS = ['csv']
    TAB_SEPARATED_FILES = ['tsv']

    # Derived groups are de-duplicated and kept in sorted order.
    TABULAR_FILES = sorted(set(CSV_FILE_EXTENSIONS +
                               EXCEL_FILE_EXTENSIONS +
                               TAB_SEPARATED_FILES))

    APACHE_FILE_EXTENSIONS = ['parquet', 'avro']

    STRUCTURED_FILE_EXTENSIONS = sorted(set(CSV_FILE_EXTENSIONS +
                                            EXCEL_FILE_EXTENSIONS +
                                            TABULAR_FILES +
                                            TAB_SEPARATED_FILES +
                                            APACHE_FILE_EXTENSIONS))

    SEMI_STRUCTURED_FILE_EXTENSIONS = ['json', 'jsonl']

    STANDARD_FILE_EXTENSIONS = sorted(set(CSV_FILE_EXTENSIONS +
                                          TAB_SEPARATED_FILES +
                                          SEMI_STRUCTURED_FILE_EXTENSIONS +
                                          APACHE_FILE_EXTENSIONS))

    PROPRIETARY_FILE_EXTENSIONS = EXCEL_FILE_EXTENSIONS

    EXCEL_ROW_LIMIT = 1_048_576

    JSON_OUT_FILENAME = 'output.json'
    JSON_NEWLINE_OUT_FILENAME = 'output.jsonl'
    CSV_OUT_FILENAME = 'output.csv'
    EXCEL_OUT_FILENAME = 'output.xlsx'
    PARQUET_OUT_FILENAME = 'output.parquet'
    AVRO_OUT_FILENAME = 'output.avro'

    def __init__(self, filepath):
        """
        Initialize the FileProperties class.

        Args:
            filepath (str): Path to the file.

        Raises:
            OSError: If the size of ``filepath`` cannot be read, for example
                because the file does not exist.
        """
        self.filepath = filepath
        self.filename = Path(filepath).name
        self.extension = Path(filepath).suffix
        self.extension_string = self.extension.replace('.', '')
        self.size_in_bytes = os.path.getsize(filepath)
        # Each unit is derived from the already-rounded smaller unit.
        self.size_in_kb = round((self.size_in_bytes / self.FILE_SIZE_DIVISOR), 5)
        self.size_in_mb = round((self.size_in_kb / self.FILE_SIZE_DIVISOR), 5)
        self.size_in_gb = round((self.size_in_mb / self.FILE_SIZE_DIVISOR), 5)
        self.size_in_tb = round((self.size_in_gb / self.FILE_SIZE_DIVISOR), 5)

    def _has_extension_in(self, extensions):
        """Return True if the lower-cased file extension is in ``extensions``."""
        return self.extension_string.lower() in extensions

    @property
    def is_structured(self):
        """Check if the file is structured."""
        return self._has_extension_in(self.STRUCTURED_FILE_EXTENSIONS)

    @property
    def is_semi_structured(self):
        """Check if the file is semi-structured."""
        return self._has_extension_in(self.SEMI_STRUCTURED_FILE_EXTENSIONS)

    @property
    def is_unstructured(self):
        """Check if the file is neither structured nor semi-structured."""
        return not (self.is_structured or self.is_semi_structured)

    @property
    def is_standard(self):
        """Check if the file is standard."""
        return self._has_extension_in(self.STANDARD_FILE_EXTENSIONS)

    @property
    def is_proprietary(self):
        """Check if the file is proprietary."""
        return self._has_extension_in(self.PROPRIETARY_FILE_EXTENSIONS)

    @property
    def is_csv(self):
        """Check if the file is a CSV file."""
        return self._has_extension_in(self.CSV_FILE_EXTENSIONS)

    @property
    def is_excel(self):
        """Check if the file is an Excel file."""
        return self._has_extension_in(self.EXCEL_FILE_EXTENSIONS)

    @property
    def is_apache(self):
        """Check if the file is an Apache formatted file."""
        return self._has_extension_in(self.APACHE_FILE_EXTENSIONS)

    @property
    def is_empty(self):
        """Check if the file is empty."""
        return self.size_in_bytes == 0

    @property
    def is_large(self):
        """Check if the file is greater than or equal to 1 GB."""
        return self.size_in_gb >= 1.0

    @property
    def is_tabular(self):
        """Check if the file is tabular."""
        return self._has_extension_in(self.TABULAR_FILES)
142
+
143
class CSVProperties(FileProperties):
    """Class for parsing CSV files. Mostly determining the delimiter."""

    DELIMITER_REGEX_PATTERN = r'[^0-9a-zA-Z_ "-]'
    DEFAULT_DELIMITER = ','
    DEFAULT_SAMPLE_ROWS = 1
    CSV_SNIFF_SAMPLE_ROWS = 5
    DATAFRAME_SAMPLE_ROWS = 20

    # Keyed by the csv module's own constants so the labels cannot drift from
    # the values a dialect actually carries.
    QUOTING_MAP = {
        csv.QUOTE_MINIMAL: 'quote minimal',
        csv.QUOTE_ALL: 'quote all',
        csv.QUOTE_NONNUMERIC: 'quote non-numeric',
        csv.QUOTE_NONE: 'no quoting'
    }

    def __init__(self, filepath):
        """
        Initialize the CSVProperties class.

        Args:
            filepath (str): Path to the file to read.

        Raises:
            ValueError: If the file extension is not a CSV extension.
        """
        super().__init__(filepath)
        # Reject non-CSV files before trying to decode their contents as text.
        if not self.is_csv:
            raise ValueError(
                f"File extension '{self.extension_string}' is not a valid CSV file extension."
            )
        self.first_row = self._get_first_row_from_file()
        self.delimiter = self._infer_csv_file_delimiter()

    def _get_first_row_from_file(self):
        """Read and return the first line of the file.

        Returns:
            str: The first line of the file, stripped of leading/trailing
            whitespace, or an empty string if the file is empty.
        """
        with open(self.filepath, 'r', encoding=self.DEFAULT_ENCODING) as csv_file:
            return csv_file.readline().strip()

    def _get_most_common_non_alpha_numeric_character_from_string(self):
        """Count the delimiter candidates found in the first row.

        A candidate is any character matched by ``DELIMITER_REGEX_PATTERN``.

        Returns:
            list: ``(character, count)`` tuples ordered from most to least
            common; empty when the first row holds no candidate.
        """
        columns_no_spaces = self.first_row.replace(' ', '')
        candidates = re.findall(self.DELIMITER_REGEX_PATTERN, columns_no_spaces)
        return Counter(candidates).most_common()

    def _infer_csv_file_delimiter(self):
        """Infer the delimiter of the CSV file from its first row.

        Returns:
            str: ``DEFAULT_DELIMITER`` for an empty file, a single space when
            the first row holds no delimiter candidate, otherwise the most
            common candidate.
        """
        if self.is_empty:
            return self.DEFAULT_DELIMITER
        delimiter_candidates = self._get_most_common_non_alpha_numeric_character_from_string()
        if not delimiter_candidates:
            return ' '
        return delimiter_candidates[0][0]

    def _sniff_dialect(self):
        """Sniff quoting details from the first ``CSV_SNIFF_SAMPLE_ROWS`` lines.

        Returns:
            A csv dialect; ``csv.excel`` when the sniffer cannot determine one.
        """
        with open(self.filepath, 'r', encoding=self.DEFAULT_ENCODING) as csvfile:
            # Sample whole rows: a handful of characters is rarely enough for
            # the sniffer and makes it raise on ordinary headers.
            sample = ''.join(csvfile.readline() for _ in range(self.CSV_SNIFF_SAMPLE_ROWS))
        try:
            return csv.Sniffer().sniff(sample)
        except csv.Error:
            return csv.excel

    def _get_attributes(self):
        """Generate a dictionary of CSV attributes."""
        columns_list = self.first_row.split(self.delimiter)
        columns = {c: 'VARCHAR' for c in columns_list}
        dialect = self._sniff_dialect()
        columns_string = ", ".join(columns_list)

        attributes = {
            'delimiter': self.delimiter,
            'quotechar': dialect.quotechar,
            'escapechar': dialect.escapechar,
            'doublequote': dialect.doublequote,
            'newline_delimiter': dialect.lineterminator,
            'skipinitialspace': dialect.skipinitialspace,
            'quoting': self.QUOTING_MAP.get(dialect.quoting),
            'row_count_with_header': self.row_count_with_header,
            'row_count_without_header': self.row_count_without_header,
            'columns_schema': columns,
            'columns_original_format': self.first_row,
            'columns_list': columns_list,
            'columns_string': columns_string,
            'columns_byte_string': columns_string.encode(),
            'column_count': len(columns_list)
        }

        return attributes

    @property
    def row_count_with_header(self):
        """Return the number of lines in the CSV file including the header."""
        # Cached on the instance; a shared lru_cache keyed on self would keep
        # every instance alive for as long as the cache holds it.
        cached = getattr(self, '_row_count_with_header', None)
        if cached is None:
            with open(self.filepath, 'rb') as csv_file:
                cached = sum(1 for _ in csv_file)
            self._row_count_with_header = cached
        return cached

    @property
    def row_count_without_header(self):
        """Return the number of lines excluding the header (never negative)."""
        return max(self.row_count_with_header - 1, 0)

    @property
    def columns(self):
        """Return the list of column names in the CSV file."""
        return self._get_attributes()['columns_list']

    @property
    def columns_string(self):
        """Return the column names joined with ', ' as a string."""
        return self._get_attributes()['columns_string']

    @property
    def columns_byte_string(self):
        """Return the column names joined with ', ' as bytes."""
        return self._get_attributes()['columns_byte_string']

    @property
    def column_count(self):
        """Return the number of columns in the CSV file."""
        return self._get_attributes()['column_count']

    @property
    def quotechar(self):
        """Return the quote character used in the CSV file."""
        return self._get_attributes()['quotechar']

    @property
    def escapechar(self):
        """Return the escape character used in the CSV file."""
        return self._get_attributes()['escapechar']

    @property
    def newline_delimiter(self):
        """Return the newline delimiter used in the CSV file."""
        return self._get_attributes()['newline_delimiter']
@@ -0,0 +1,47 @@
1
+ """Module for showing logging messages."""
2
+
3
+ import logging
4
+
5
+ LARGE_FILE_WARNING = "File is large and may load into memory slowly or exceed memory capacity."
6
+ DUCKDB_ENGINE_ERROR = """DuckDB engine failed due to the following error: {error}. \
7
+ Switching to Polars."""
8
+
9
+
10
def show_warning(message):
    """Emit ``message`` at WARNING level through the root logger.

    ``logging.basicConfig`` is called first; it only configures the root logger
    when that logger has no handlers yet.

    Args:
        message (str): The message to show.

    Returns:
        The result of ``logging.warning`` (None).
    """
    log_format = '%(levelname)s - %(message)s'
    logging.basicConfig(level=logging.WARNING, format=log_format)
    outcome = logging.warning(message)
    return outcome
18
+
19
+
20
def show_info_message(message):
    """Emit ``message`` at INFO level through the root logger.

    ``logging.basicConfig`` is called first; it only configures the root logger
    when that logger has no handlers yet, so an already-configured root logger
    keeps its own level.

    Args:
        message (str): The message to show.

    Returns:
        The result of ``logging.info`` (None).
    """
    log_format = '%(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    outcome = logging.info(message)
    return outcome
28
+
29
+
30
def show_large_file_warning():
    """Emit the ``LARGE_FILE_WARNING`` message as a warning."""
    warning_text = LARGE_FILE_WARNING
    show_warning(warning_text)
33
+
34
+
35
def duckdb_query_error(error_message):
    """Warn that the DuckDB engine failed, embedding ``error_message``.

    Args:
        error_message: The error detail substituted into ``DUCKDB_ENGINE_ERROR``.

    Returns:
        The result of ``show_warning``.
    """
    return show_warning(DUCKDB_ENGINE_ERROR.format(error=error_message))
39
+
40
+
41
def show_dataframe_sample(dataframe):
    """Emit a dataframe as an info message.

    Args:
        dataframe (dataframe): The dataframe to show.

    Returns:
        The result of ``show_info_message``.
    """
    outcome = show_info_message(dataframe)
    return outcome
@@ -0,0 +1,103 @@
1
+ """Module to store database queries and query strings."""
2
+
3
+ # standard library
4
+
5
+ # third party libraries
6
+
7
+ # local libraries
8
+ from .databases import DuckDBDatabase
9
+ from .fileproperties import FileProperties
10
+
11
class DuckDBQueries(DuckDBDatabase):
    """Class to store DuckDB database queries and query strings."""

    def __init__(self, filepath):
        """
        Initialize the DuckDBQueries class.

        Args:
            filepath (str): Path to the file.
        """
        super().__init__(filepath)
        self.export_properties = FileProperties(self.filepath)

    @staticmethod
    def _escape_sql_string(value):
        """Return ``value`` as text safe to place between single quotes in SQL.

        Single quotes are doubled so that a path or delimiter containing one
        cannot terminate the string literal early.
        """
        return str(value).replace("'", "''")

    def _set_out_filename(self, default_filename, out_filename=None):
        """Evaluate if a filename is passed in and if not, return default filename."""
        return out_filename if out_filename else default_filename

    def import_csv_query(self, delimiter):
        """Query to import a CSV file into a DuckDB table.

        Args:
            delimiter (str): The delimiter to use.

        Returns:
            str: A CREATE OR REPLACE TABLE statement reading ``self.filepath``.
        """
        source = self._escape_sql_string(self.filepath)
        delim = self._escape_sql_string(delimiter)
        return f"""
            CREATE OR REPLACE TABLE {self.database_table_name} AS
            SELECT *
            FROM read_csv('{source}',
                          auto_detect=true,
                          delim='{delim}',
                          header=true,
                          null_padding=true,
                          all_varchar=True);
            """

    def select_from_duckdb_table(self):
        """Query to select from a DuckDB table."""
        return f"SELECT * FROM {self.database_table_name}"

    def export_csv_query(self, out_filename=None):
        """Query to export a DuckDB table to a CSV file.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_string(
            self._set_out_filename(self.export_properties.CSV_OUT_FILENAME, out_filename)
        )
        # The format is stated explicitly so it does not depend on the
        # extension of a caller-supplied filename.
        return (f"COPY {self.database_table_name} TO '{filename}' "
                "(FORMAT CSV, HEADER, DELIMITER ',');")

    def export_excel_query(self, out_filename=None):
        """Query to export a DuckDB table to an Excel file.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_string(
            self._set_out_filename(self.export_properties.EXCEL_OUT_FILENAME, out_filename)
        )
        return f"""
            INSTALL spatial;
            LOAD spatial;
            COPY (SELECT * FROM {self.database_table_name})
            TO '{filename}'(FORMAT GDAL, DRIVER 'xlsx')
            """

    def export_json_query(self, out_filename=None):
        """Query to export a DuckDB table to a JSON file (a single JSON array).

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_string(
            self._set_out_filename(self.export_properties.JSON_OUT_FILENAME, out_filename)
        )
        return (f"COPY (SELECT * FROM {self.database_table_name}) "
                f"TO '{filename}' (FORMAT JSON, ARRAY true) ")

    def export_json_newline_delimited_query(self, out_filename=None):
        """Query to export a DuckDB table to a JSON file with newline delimited.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_string(
            self._set_out_filename(self.export_properties.JSON_NEWLINE_OUT_FILENAME,
                                   out_filename)
        )
        return (f"COPY (SELECT * FROM {self.database_table_name}) "
                f"TO '{filename}' (FORMAT JSON)")

    def export_parquet_query(self, out_filename=None):
        """Query to export a DuckDB table to a Parquet file.

        Args:
            out_filename (str, optional): The name of the output file.
        """
        filename = self._escape_sql_string(
            self._set_out_filename(self.export_properties.PARQUET_OUT_FILENAME, out_filename)
        )
        return f"COPY (SELECT * FROM {self.database_table_name}) TO '{filename}'(FORMAT PARQUET)"
datagrunt/csvfile.py ADDED
@@ -0,0 +1,156 @@
1
+ """Module for reading CSV files and converting CSV files to different standard file formats."""
2
+
3
+ # standard library
4
+
5
+ # third party libraries
6
+ import duckdb
7
+
8
+ # local libraries
9
+ from .core.fileproperties import CSVProperties
10
+ from .core.engines import CSVReaderDuckDBEngine, CSVReaderPolarsEngine
11
+ from .core.engines import CSVWriterDuckDBEngine, CSVWriterPolarsEngine
12
+ from .core.queries import DuckDBQueries
13
+
14
class CSVReader(CSVProperties):
    """Single entry point for reading CSV files with DuckDB or Polars."""

    READER_ENGINES = ['duckdb', 'polars']
    VALUE_ERROR_MESSAGE = """Reader engine '{engine}' is not 'duckdb' or 'polars'. Pass either 'duckdb' or 'polars' as valid engine params."""

    def __init__(self, filepath, engine='polars'):
        """Initialize the CSV Reader class.

        Args:
            filepath (str): Path to the file to read.
            engine (str, default 'polars'): Determines which reader engine class to instantiate.

        Raises:
            ValueError: If the normalized engine name is not in ``READER_ENGINES``.
        """
        super().__init__(filepath)
        self.db_table = DuckDBQueries(self.filepath).database_table_name
        # Case and embedded spaces are ignored when matching the engine name.
        normalized = engine.lower().replace(' ', '')
        self.engine = normalized
        if normalized in self.READER_ENGINES:
            return
        raise ValueError(self.VALUE_ERROR_MESSAGE.format(engine=self.engine))

    def _set_reader_engine(self):
        """Build the reader engine matching ``self.engine``.

        Returns:
            A CSVReaderPolarsEngine when the engine is 'polars', otherwise a
            CSVReaderDuckDBEngine.
        """
        if self.engine == 'polars':
            return CSVReaderPolarsEngine(self.filepath)
        return CSVReaderDuckDBEngine(self.filepath)

    def get_sample(self):
        """Show a sample of the CSV file using the selected engine."""
        reader = self._set_reader_engine()
        reader.get_sample()

    def to_dataframe(self):
        """Converts CSV to a Polars dataframe.

        Returns:
            A Polars dataframe.
        """
        reader = self._set_reader_engine()
        return reader.to_dataframe()

    def to_arrow_table(self):
        """Converts CSV to a PyArrow table.

        Returns:
            A PyArrow table.
        """
        reader = self._set_reader_engine()
        return reader.to_arrow_table()

    def to_dicts(self):
        """Converts CSV to a list of Python dictionaries.

        Returns:
            A list of dictionaries.
        """
        reader = self._set_reader_engine()
        return reader.to_dicts()

    def query_data(self, sql_query):
        """Queries a CSV file after importing it into DuckDB.

        Args:
            sql_query (str): Query to run against DuckDB.

        Returns:
            A DuckDB DuckDBPyRelation with the query results.

        Example:
            dg = CSVReader('myfile.csv')
            query = f"SELECT col1, col2 FROM {dg.db_table}"
            dg.query_data(query)
        """
        query_builder = DuckDBQueries(self.filepath)
        import_statement = query_builder.import_csv_query(self.delimiter)
        duckdb.sql(import_statement)
        return duckdb.sql(sql_query)
88
+
89
class CSVWriter(CSVProperties):
    """Single entry point for converting CSV files to other supported file types."""

    WRITER_ENGINES = ['duckdb', 'polars']
    VALUE_ERROR_MESSAGE = """Writer engine '{engine}' is not 'duckdb' or 'polars'. Pass either 'duckdb' or 'polars' as valid engine params."""

    def __init__(self, filepath, engine='duckdb'):
        """Initialize the CSV Writer class.

        Args:
            filepath (str): Path to the CSV file to convert.
            engine (str, default 'duckdb'): Determines which writer engine class to instantiate.

        Raises:
            ValueError: If the normalized engine name is not in ``WRITER_ENGINES``.
        """
        super().__init__(filepath)
        self.db_table = DuckDBQueries(self.filepath).database_table_name
        # Case and embedded spaces are ignored when matching the engine name.
        normalized = engine.lower().replace(' ', '')
        self.engine = normalized
        if normalized in self.WRITER_ENGINES:
            return
        raise ValueError(self.VALUE_ERROR_MESSAGE.format(engine=self.engine))

    def _set_writer_engine(self):
        """Build the writer engine matching ``self.engine``.

        Returns:
            A CSVWriterPolarsEngine when the engine is 'polars', otherwise a
            CSVWriterDuckDBEngine.
        """
        if self.engine == 'polars':
            return CSVWriterPolarsEngine(self.filepath)
        return CSVWriterDuckDBEngine(self.filepath)

    def write_csv(self, out_filename=None):
        """Export the CSV contents to a CSV file with the selected engine.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        writer = self._set_writer_engine()
        return writer.write_csv(out_filename)

    def write_excel(self, out_filename=None):
        """Export the CSV contents to an Excel file with the selected engine.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        writer = self._set_writer_engine()
        return writer.write_excel(out_filename)

    def write_json(self, out_filename=None):
        """Export the CSV contents to a JSON file with the selected engine.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        writer = self._set_writer_engine()
        return writer.write_json(out_filename)

    def write_json_newline_delimited(self, out_filename=None):
        """Export the CSV contents to a newline-delimited JSON file.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        writer = self._set_writer_engine()
        return writer.write_json_newline_delimited(out_filename)

    def write_parquet(self, out_filename=None):
        """Export the CSV contents to a Parquet file with the selected engine.

        Args:
            out_filename (optional, str): The name of the output file.
        """
        writer = self._set_writer_engine()
        return writer.write_parquet(out_filename)
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Martin Graham
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.1
2
+ Name: datagrunt
3
+ Version: 0.0.0
4
+ Summary: Read CSV files and convert to other file formats easily
5
+ Author-email: Martin Graham <datagrunt@datagrunt.io>
6
+ License: MIT License
7
+ Project-URL: Homepage, https://pmgraham.github.io/datagrunt-docs
8
+ Project-URL: Bug Tracker, https://github.com/pmgraham/datagrunt/issues
9
+ Project-URL: Documentation, https://pmgraham.github.io/datagrunt-docs
10
+ Project-URL: Source Code, https://github.com/pmgraham/datagrunt
11
+ Keywords: csv,data,duckdb,polars,pyarrow,xlsx,delimiter
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: duckdb>=1.1.0
23
+ Requires-Dist: polars>=1.7.1
24
+ Requires-Dist: pyarrow>=17.0.0
25
+ Requires-Dist: XlsxWriter>=3.2.0
26
+ Provides-Extra: build
27
+ Requires-Dist: build; extra == "build"
28
+ Requires-Dist: twine; extra == "build"
29
+ Requires-Dist: bumpver; extra == "build"
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=7.0; extra == "dev"
32
+ Requires-Dist: pytest-cov>=3.0; extra == "dev"
33
+ Requires-Dist: black; extra == "dev"
34
+ Requires-Dist: isort; extra == "dev"
35
+ Requires-Dist: flake8; extra == "dev"
36
+
37
+ # Welcome To Datagrunt
38
+
39
+ Datagrunt is a Python library designed to simplify the way you work with CSV files. It provides a streamlined approach to reading, processing, and transforming your data into various formats, making data manipulation efficient and intuitive.
40
+
41
+ ## Why Datagrunt?
42
+
43
+ Born out of real-world frustration, Datagrunt eliminates the need for repetitive coding when handling CSV files. Whether you're a data analyst, data engineer, or data scientist, Datagrunt empowers you to focus on insights, not tedious data wrangling.
44
+
45
+ ## Key Features
46
+
47
+ - **Intelligent Delimiter Inference:** Datagrunt automatically detects and applies the correct delimiter for your CSV files.
48
+ - **Seamless Data Processing:** Leverage the robust capabilities of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) to perform advanced data processing tasks directly on your CSV data.
49
+ - **Flexible Transformation:** Easily convert your processed CSV data into various formats to suit your needs.
50
+ - **Pythonic API:** Enjoy a clean and intuitive API that integrates seamlessly into your existing Python workflows.
51
+
52
+ ## Installation
53
+
54
+ Get started with Datagrunt in seconds using pip:
55
+
56
+ ```bash
57
+ pip install datagrunt
58
+ ```
59
+
60
+ ## Getting Started
61
+
62
+ ```python
63
+ from datagrunt import CSVReader
64
+
65
+ # Load your CSV file
66
+ csv_file = 'electric_vehicle_population_data.csv'
67
+ engine = 'duckdb'
68
+
69
+ # Use DuckDB as the processing engine (the engine defaults to 'polars' when not specified)
70
+ dg = CSVReader(csv_file, engine=engine)
71
+
72
+ # return sample of the data to get a peek at the schema
73
+ dg.get_sample()
74
+ ┌────────────┬───────────┬──────────────┬───┬──────────────────────┬──────────────────────┬───────────────────┐
75
+ │ VIN (1-10) │ County │ City │ … │ Vehicle Location │ Electric Utility │ 2020 Census Tract │
76
+ │ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │
77
+ ├────────────┼───────────┼──────────────┼───┼──────────────────────┼──────────────────────┼───────────────────┤
78
+ │ 5YJSA1E28K │ Snohomish │ Mukilteo │ … │ POINT (-122.29943 … │ PUGET SOUND ENERGY… │ 53061042001 │
79
+ │ 1C4JJXP68P │ Yakima │ Yakima │ … │ POINT (-120.468875… │ PACIFICORP │ 53077001601 │
80
+ │ WBY8P6C05L │ Kitsap │ Kingston │ … │ POINT (-122.517835… │ PUGET SOUND ENERGY… │ 53035090102 │
81
+ │ JTDKARFP1J │ Kitsap │ Port Orchard │ … │ POINT (-122.653005… │ PUGET SOUND ENERGY… │ 53035092802 │
82
+ │ 5UXTA6C09N │ Snohomish │ Everett │ … │ POINT (-122.203234… │ PUGET SOUND ENERGY… │ 53061041605 │
83
+ │ 5YJYGDEF8L │ King │ Seattle │ … │ POINT (-122.378886… │ CITY OF SEATTLE - … │ 53033004703 │
84
+ │ JTMAB3FV7P │ Thurston │ Rainier │ … │ POINT (-122.677141… │ PUGET SOUND ENERGY… │ 53067012530 │
85
+ │ JN1AZ0CPXC │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022402 │
86
+ │ JN1AZ0CP7B │ King │ Kirkland │ … │ POINT (-122.192596… │ PUGET SOUND ENERGY… │ 53033022603 │
87
+ │ 1N4AZ0CP0F │ Thurston │ Olympia │ … │ POINT (-122.86491 … │ PUGET SOUND ENERGY… │ 53067010300 │
88
+ │ · │ · │ · │ · │ · │ · │ · │
89
+ │ · │ · │ · │ · │ · │ · │ · │
90
+ │ · │ · │ · │ · │ · │ · │ · │
91
+ │ 5YJYGDEE7M │ Clark │ Vancouver │ … │ POINT (-122.515805… │ BONNEVILLE POWER A… │ 53011041310 │
92
+ │ 7SAYGAEE0P │ Snohomish │ Monroe │ … │ POINT (-121.968385… │ PUGET SOUND ENERGY… │ 53061052203 │
93
+ │ 2C4RC1N75P │ King │ Burien │ … │ POINT (-122.347227… │ CITY OF SEATTLE - … │ 53033027600 │
94
+ │ 1FTVW1EVXP │ King │ Kirkland │ … │ POINT (-122.202653… │ PUGET SOUND ENERGY… │ 53033022300 │
95
+ │ 4JGGM1CB2P │ King │ Seattle │ … │ POINT (-122.2453 4… │ CITY OF SEATTLE - … │ 53033011700 │
96
+ │ 1N4BZ0CP0G │ King │ Seattle │ … │ POINT (-122.334079… │ CITY OF SEATTLE - … │ 53033008300 │
97
+ │ 7SAYGDEF2N │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024704 │
98
+ │ 1N4BZ1DP7L │ King │ Bellevue │ … │ POINT (-122.144149… │ PUGET SOUND ENERGY… │ 53033024902 │
99
+ ...
100
+ ├────────────┴───────────┴──────────────┴───┴──────────────────────┴──────────────────────┴───────────────────┤
101
+ │ ? rows (>9999 rows, 20 shown) 17 columns (6 shown) │
102
+ └─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
103
+ ```
104
+
105
+ ## DuckDB Integration for Performant SQL Queries
106
+ ```python
107
+ from datagrunt import CSVReader
108
+
109
+ csv_file = 'electric_vehicle_population_data.csv'
110
+ engine = 'duckdb'
111
+
112
+ dg = CSVReader(csv_file, engine=engine)
113
+
114
+ # Construct your SQL query
115
+ query = f"""
116
+ WITH core AS (
117
+ SELECT
118
+ City AS city,
119
+ "VIN (1-10)" AS vin
120
+ FROM {dg.db_table}
121
+ )
122
+ SELECT
123
+ city,
124
+ COUNT(vin) AS vehicle_count
125
+ FROM core
126
+ GROUP BY 1
127
+ ORDER BY 2 DESC
128
+ """
129
+
130
+ # Execute the query and get results as a Polars DataFrame
131
+ df = dg.query_data(query).pl()
132
+ print(df)
133
+ ┌────────────────┬───────────────┐
134
+ │ city ┆ vehicle_count │
135
+ │ --- ┆ --- │
136
+ │ str ┆ i64 │
137
+ ╞════════════════╪═══════════════╡
138
+ │ Seattle ┆ 32602 │
139
+ │ Bellevue ┆ 9960 │
140
+ │ Redmond ┆ 7165 │
141
+ │ Vancouver ┆ 7081 │
142
+ │ Bothell ┆ 6602 │
143
+ │ … ┆ … │
144
+ │ Glenwood ┆ 1 │
145
+ │ Walla Walla Co ┆ 1 │
146
+ │ Pittsburg ┆ 1 │
147
+ │ Decatur ┆ 1 │
148
+ │ Redwood City ┆ 1 │
149
+ └────────────────┴───────────────┘
150
+ ```
151
+ ## License
152
+ This project is licensed under the [MIT License](https://opensource.org/license/mit).
153
+
154
+ ## Acknowledgements
155
+ A HUGE thank you to the open source community and the creators of [DuckDB](https://duckdb.org) and [Polars](https://pola.rs) for their fantastic libraries that power Datagrunt.
@@ -0,0 +1,13 @@
1
+ datagrunt/__init__.py,sha256=rWgU2R8hvXqr9hw_DH_7j74PI2gnXlHCybQSpwDiZfg,1122
2
+ datagrunt/csvfile.py,sha256=jbIvA9_0ucRJ4qCu6yQ1InSg0ENxN2dkCZcX8rVN4iI,5476
3
+ datagrunt/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ datagrunt/core/databases.py,sha256=yo8ILK9dtj8zusrM8HgoA5s629b-ghGRp2HumvJS4eE,1702
5
+ datagrunt/core/engines.py,sha256=K__2kkdKwvxeL6QwmyEyZdeuUT5MDPpY0lzKwYHrouM,7543
6
+ datagrunt/core/fileproperties.py,sha256=e5mP7bAsQzKpppryZqEV425AXNXsMGZnjrPhYz8gaJI,10198
7
+ datagrunt/core/logger.py,sha256=ck5HmwHmk0SUmTjQudqLgpQk5mpC9T0UOlDKqi6qssI,1250
8
+ datagrunt/core/queries.py,sha256=2OPD2Zz1Toryr5_nVANkJvifwg6MzCx4obah34m-IBY,3846
9
+ datagrunt-0.0.0.dist-info/LICENSE,sha256=qY7AmIOAwE05aVD7siEo8-2HOUz5WYUEhxbX0Ts9fyg,1069
10
+ datagrunt-0.0.0.dist-info/METADATA,sha256=K64t3rDDe8IiHUrJrcrweVYPJSa3COMhC0jWvZ01TpQ,9250
11
+ datagrunt-0.0.0.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
12
+ datagrunt-0.0.0.dist-info/top_level.txt,sha256=kVGx0P9sIaw_RegsAR94f7fyE4Um4ayE5vf_aTImMoE,10
13
+ datagrunt-0.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (74.1.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ datagrunt