TestDataX 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- src/__init__.py +7 -0
- src/cli.py +166 -0
- src/exporters/__init__.py +0 -0
- src/exporters/base_exporter.py +23 -0
- src/exporters/csv_exporter.py +115 -0
- src/exporters/json_exporter.py +89 -0
- src/exporters/mssql_exporter.py +198 -0
- src/exporters/mysql_exporter.py +184 -0
- src/exporters/oracle_exporter.py +205 -0
- src/exporters/orc_exporter.py +100 -0
- src/exporters/parquet_exporter.py +102 -0
- src/exporters/utils/__init__.py +0 -0
- src/exporters/utils/chunker.py +27 -0
- src/exporters/utils/constants.py +55 -0
- src/exporters/utils/exporter_config.py +17 -0
- src/exporters/utils/formatters.py +165 -0
- src/generator.py +117 -0
- src/providers/__init__.py +4 -0
- src/providers/base.py +58 -0
- src/providers/faker_provider.py +65 -0
- src/schemas.py +81 -0
- testdatax-0.1.0.dist-info/LICENSE +21 -0
- testdatax-0.1.0.dist-info/METADATA +345 -0
- testdatax-0.1.0.dist-info/RECORD +26 -0
- testdatax-0.1.0.dist-info/WHEEL +4 -0
- testdatax-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import decimal
|
|
2
|
+
from datetime import date, datetime
|
|
3
|
+
from typing import Any
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
from .base_exporter import BaseExporter
|
|
7
|
+
from .utils.constants import DEFAULT_SCHEMA
|
|
8
|
+
|
|
9
|
+
# Logical schema type -> MySQL column type used in CREATE TABLE statements.
MYSQL_TYPE_MAPPING: dict[str, str] = {
    # Character data
    "string": "VARCHAR(255)",
    "text": "TEXT",
    # Numeric data
    "integer": "INT",
    "bigint": "BIGINT",
    "float": "FLOAT",
    "decimal": "DECIMAL(18,2)",
    "boolean": "TINYINT(1)",
    # Temporal data
    "date": "DATE",
    "datetime": "DATETIME",
    # Binary data and identifiers
    "blob": "LONGBLOB",
    "uuid": "VARCHAR(36)",
    # Bare keyword only; the exporter appends the value list when one is given
    "enum": "ENUM",
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MysqlExporter(BaseExporter):
    """Exports data to a MySQL compatible SQL file.

    The generated file contains one CREATE TABLE statement followed by one
    INSERT statement per row.
    """

    def _format_value(
        self,
        value: (
            None
            | str
            | UUID
            | datetime
            | date
            | bool
            | bytes
            | int
            | float
            | decimal.Decimal
        ),
    ) -> str:
        """Format a Python value as a MySQL literal.

        Args:
            value: The value to format. Supported types:
                - None: rendered as ``NULL``
                - str / UUID: backslash-escaped and wrapped in single quotes
                - datetime / date: ISO format string wrapped in single quotes
                - bool: ``1`` for True, ``0`` for False
                - bytes: hexadecimal literal, e.g. ``x'00ff'``
                - int / float / decimal.Decimal: ``str(value)``

        Returns:
            str: The literal, ready to be embedded in a MySQL statement.

        Raises:
            ValueError: If the value type is not supported.

        """
        if value is None:
            return "NULL"
        if isinstance(value, (str, UUID)):
            # The backslash must be escaped first: otherwise a value ending in
            # "\" would escape the closing quote, and the backslashes added
            # for the other characters would themselves be doubled.
            escaped = (
                str(value)
                .replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace("\n", "\\n")
                .replace("\r", "\\r")
                .replace("\x00", "\\0")
            )
            return f"'{escaped}'"
        if isinstance(value, (datetime, date)):
            return f"'{value.isoformat()}'"
        # bool is a subclass of int, so it has to be tested before int.
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, bytes):
            return f"x'{value.hex()}'"
        if isinstance(value, (int, float, decimal.Decimal)):
            # NOTE(review): NaN/Infinity render as "nan"/"inf", which are not
            # numeric literals in SQL - confirm the providers never emit them.
            return str(value)
        raise ValueError(f"Unsupported type: {type(value)}")

    def _get_column_type(self, field: dict) -> str:
        """Get the MySQL column type for a field definition.

        Args:
            field (dict): Field definition. Its ``"type"`` entry is either a
                type name or a nested dict that itself has a ``"type"`` key.
                A missing type defaults to ``"string"``.

        Returns:
            str: The corresponding entry of MYSQL_TYPE_MAPPING.

        Raises:
            KeyError: If the field type is not found in MYSQL_TYPE_MAPPING.

        """
        field_type = field.get("type", "string")
        if isinstance(field_type, dict):
            field_type = field_type.get("type", "string")
        return MYSQL_TYPE_MAPPING[field_type]

    def _create_table_stmt(self, schema: dict, table_name: str = "output") -> str:
        """Generate a MySQL CREATE TABLE statement for the given schema.

        Every column is declared nullable. A dict definition of type
        ``"enum"`` that carries a ``"values"`` list becomes an ``ENUM(...)``
        column whose values are quoted and escaped like any other string.

        Args:
            schema (dict): Field names mapped to a type name or to a dict
                definition with a ``"type"`` key.
            table_name (str, optional): Name for the table. Defaults to "output".

        Returns:
            str: The CREATE TABLE statement followed by a blank line.

        Raises:
            KeyError: If a field type is not found in MYSQL_TYPE_MAPPING.

        Example:
            schema = {
                "id": "integer",
                "status": {"type": "enum", "values": ["active", "inactive"]}
            }
            result = exporter._create_table_stmt(schema, "users")
            # Returns: CREATE TABLE users (
            #  id INT NULL,
            #  status ENUM('active','inactive') NULL
            # );

        """
        columns = []
        for field_name, field_def in schema.items():
            is_dict_def = isinstance(field_def, dict)
            sql_type = self._get_column_type(
                field_def if is_dict_def else {"type": field_def}
            )
            if is_dict_def and field_def.get("type") == "enum" and "values" in field_def:
                # Reuse the string formatter so quotes/backslashes inside an
                # enum value cannot break out of the literal.
                enum_values = ",".join(
                    self._format_value(str(item)) for item in field_def["values"]
                )
                sql_type = f"ENUM({enum_values})"
            columns.append(f" {field_name} {sql_type} NULL")

        return f"CREATE TABLE {table_name} (\n" + ",\n".join(columns) + "\n);\n\n"

    def _create_insert_stmt(
        self, row: dict[str, Any], table_name: str = "output"
    ) -> str:
        """Create a MySQL INSERT statement from a dictionary of values.

        Args:
            row (dict[str, Any]): Column names mapped to the values to insert.
            table_name (str, optional): Name of the target table. Defaults to "output".

        Returns:
            str: The INSERT statement, terminated by a semicolon.

        Raises:
            ValueError: If a value has an unsupported type.

        Example:
            >>> exporter._create_insert_stmt({"id": 1, "name": "test"}, "users")
            "INSERT INTO users (id, name) VALUES (1, 'test');"

        """
        columns = ", ".join(row)
        values = ", ".join(self._format_value(item) for item in row.values())
        return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to a MySQL compatible SQL file.

        Nothing is written when ``data`` is empty. The table name is the file
        name of ``output_path`` up to its first dot.

        Args:
            data: Rows to export, one dict per row.
            output_path: Path of the SQL file to write.
            schema: Schema used for the CREATE TABLE statement; DEFAULT_SCHEMA
                is used when omitted or empty.

        """
        from pathlib import Path

        if not data:
            return

        # Path understands the platform's separators, unlike splitting on "/".
        table_name = Path(output_path).name.split(".")[0]
        schema_to_use = schema or DEFAULT_SCHEMA

        # Explicit encoding: generated text may contain non-ASCII characters
        # and the platform default encoding is not guaranteed to handle them.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(self._create_table_stmt(schema_to_use, table_name))
            f.writelines(
                self._create_insert_stmt(row, table_name) + "\n" for row in data
            )
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import decimal
|
|
2
|
+
from datetime import date, datetime
|
|
3
|
+
from typing import Any
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
from .base_exporter import BaseExporter
|
|
7
|
+
from .utils.constants import DEFAULT_SCHEMA
|
|
8
|
+
|
|
9
|
+
# Logical schema type -> Oracle column type used in CREATE TABLE statements.
ORACLE_TYPE_MAPPING: dict[str, str] = {
    # Character data
    "string": "VARCHAR2(255)",  # variable-length character string
    "text": "CLOB",  # large character object
    # Numeric data
    "integer": "NUMBER(10)",
    "bigint": "NUMBER(19)",
    "float": "FLOAT",
    "decimal": "NUMBER(18,2)",
    "boolean": "NUMBER(1)",  # stored as 0/1
    # Temporal data
    "date": "DATE",
    "datetime": "TIMESTAMP",
    # Binary data and identifiers
    "blob": "BLOB",
    "uuid": "VARCHAR2(36)",  # stored as text
    # No ENUM type is mapped; the exporter adds a CHECK constraint instead
    "enum": "VARCHAR2(255)",
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OracleExporter(BaseExporter):
    """Exports data to an Oracle compatible SQL file.

    The generated file contains one CREATE TABLE statement followed by one
    INSERT statement per row.
    """

    def _format_value(
        self,
        value: (
            None
            | str
            | UUID
            | datetime
            | date
            | bool
            | bytes
            | int
            | float
            | decimal.Decimal
        ),
    ) -> str:
        """Format a Python value as an Oracle SQL expression.

        Args:
            value: The value to format. Supported types:
                - None: rendered as ``NULL``
                - str: single quotes doubled, wrapped in single quotes
                - UUID: canonical text form wrapped in single quotes
                - datetime: ``TO_TIMESTAMP(...)`` for naive values,
                  ``TO_TIMESTAMP_TZ(...)`` for timezone-aware values
                - date: ANSI ``DATE '...'`` literal
                - bool: ``1`` for True, ``0`` for False
                - bytes: ``HEXTORAW('<hex>')``
                - int / float / decimal.Decimal: ``str(value)``

        Returns:
            str: The expression, ready to be embedded in an Oracle statement.

        Raises:
            ValueError: If the value type is not supported.

        """
        if value is None:
            return "NULL"
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"  # '' escapes a quote
        if isinstance(value, UUID):
            return f"'{value}'"
        # datetime is a subclass of date, so it has to be tested before date.
        if isinstance(value, datetime):
            if value.utcoffset() is not None:
                # An aware datetime serialises with a "+HH:MM" offset that the
                # plain TO_TIMESTAMP mask cannot consume. Force the fractional
                # part so the offset always follows ".FF" in the same position.
                stamp = value.isoformat(timespec="microseconds")
                return (
                    f"TO_TIMESTAMP_TZ('{stamp}', "
                    "'YYYY-MM-DD\"T\"HH24:MI:SS.FFTZH:TZM')"
                )
            return (
                f"TO_TIMESTAMP('{value.isoformat()}', "
                "'YYYY-MM-DD\"T\"HH24:MI:SS.FF')"
            )
        if isinstance(value, date):
            return f"DATE '{value.isoformat()}'"
        # bool is a subclass of int, so it has to be tested before int.
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, bytes):
            return f"HEXTORAW('{value.hex()}')"
        if isinstance(value, (int, float, decimal.Decimal)):
            # NOTE(review): NaN/Infinity render as "nan"/"inf", which are not
            # numeric literals in SQL - confirm the providers never emit them.
            return str(value)
        raise ValueError(f"Unsupported type: {type(value)}")

    def _get_column_type(self, field: dict) -> str:
        """Get the Oracle column type for a field definition.

        Args:
            field (dict): Field definition. Its ``"type"`` entry is either a
                type name or a nested dict that itself has a ``"type"`` key.
                A missing type defaults to ``"string"``.

        Returns:
            str: The corresponding entry of ORACLE_TYPE_MAPPING.

        Raises:
            KeyError: If the field type is not found in ORACLE_TYPE_MAPPING.

        """
        field_type = field.get("type", "string")
        if isinstance(field_type, dict):
            field_type = field_type.get("type", "string")
        return ORACLE_TYPE_MAPPING[field_type]

    def _create_table_stmt(self, schema: dict, table_name: str = "output") -> str:
        """Generate an Oracle CREATE TABLE statement for the given schema.

        Every column is declared nullable. A dict definition of type
        ``"enum"`` that carries a ``"values"`` list additionally produces a
        CHECK constraint, listed after the columns, restricting the column to
        those values.

        Args:
            schema (dict): Field names mapped to a type name or to a dict
                definition with a ``"type"`` key.
            table_name (str, optional): Name for the table. Defaults to "output".

        Returns:
            str: The CREATE TABLE statement followed by a blank line.

        Raises:
            KeyError: If a field type is not found in ORACLE_TYPE_MAPPING.

        Example:
            schema = {
                "id": "integer",
                "status": {"type": "enum", "values": ["active", "inactive"]}
            }
            result = exporter._create_table_stmt(schema, "users")
            # Returns: CREATE TABLE users (
            #  id NUMBER(10) NULL,
            #  status VARCHAR2(255) NULL,
            #  CHECK (status IN ('active','inactive'))
            # );

        """
        columns = []
        check_constraints = []

        for field_name, field_def in schema.items():
            is_dict_def = isinstance(field_def, dict)
            sql_type = self._get_column_type(
                field_def if is_dict_def else {"type": field_def}
            )

            if is_dict_def and field_def.get("type") == "enum" and "values" in field_def:
                # Reuse the string formatter so a quote inside an allowed
                # value cannot terminate the literal early.
                allowed = ",".join(
                    self._format_value(str(item)) for item in field_def["values"]
                )
                check_constraints.append(f"CHECK ({field_name} IN ({allowed}))")

            columns.append(f" {field_name} {sql_type} NULL")

        # Columns first, then any table-level CHECK constraints.
        constraint_sql = (
            ",\n " + ",\n ".join(check_constraints) if check_constraints else ""
        )
        return (
            f"CREATE TABLE {table_name} (\n"
            + ",\n".join(columns)
            + constraint_sql
            + "\n);\n\n"
        )

    def _create_insert_stmt(
        self, row: dict[str, Any], table_name: str = "output"
    ) -> str:
        """Create an Oracle INSERT statement from a dictionary of values.

        Args:
            row (dict[str, Any]): Column names mapped to the values to insert.
            table_name (str, optional): Name of the target table. Defaults to "output".

        Returns:
            str: The INSERT statement, terminated by a semicolon.

        Raises:
            ValueError: If a value has an unsupported type.

        Example:
            >>> exporter._create_insert_stmt({"id": 1, "name": "test"}, "users")
            "INSERT INTO users (id, name) VALUES (1, 'test');"

        """
        columns = ", ".join(row)
        values = ", ".join(self._format_value(item) for item in row.values())
        return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to an Oracle compatible SQL file.

        Nothing is written when ``data`` is empty. The table name is the file
        name of ``output_path`` up to its first dot.

        Args:
            data: Rows to export, one dict per row.
            output_path: Path of the SQL file to write.
            schema: Schema used for the CREATE TABLE statement; DEFAULT_SCHEMA
                is used when omitted or empty.

        """
        from pathlib import Path

        if not data:
            return

        # Path understands the platform's separators, unlike splitting on "/".
        table_name = Path(output_path).name.split(".")[0]
        schema_to_use = schema or DEFAULT_SCHEMA

        # Explicit encoding: generated text may contain non-ASCII characters
        # and the platform default encoding is not guaranteed to handle them.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(self._create_table_stmt(schema_to_use, table_name))
            f.writelines(
                self._create_insert_stmt(row, table_name) + "\n" for row in data
            )
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import pyarrow as pa
|
|
6
|
+
import pyarrow.orc as po
|
|
7
|
+
|
|
8
|
+
from .base_exporter import BaseExporter
|
|
9
|
+
from .utils.chunker import DataChunker
|
|
10
|
+
from .utils.constants import (
|
|
11
|
+
CHUNK_SIZE_ORC,
|
|
12
|
+
DEFAULT_ORC_COMPRESSION,
|
|
13
|
+
VALID_ORC_COMPRESSION,
|
|
14
|
+
OrcCompression,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class OrcExporter(BaseExporter):
    """Exports data to ORC file format using PyArrow and Pandas."""

    def __init__(
        self,
        chunk_size: int = CHUNK_SIZE_ORC,
        compression: OrcCompression = DEFAULT_ORC_COMPRESSION,
    ) -> None:
        """Initialize the exporter with the specified options.

        Args:
            chunk_size: Number of rows to write at once.
            compression: Compression codec; must be one of
                VALID_ORC_COMPRESSION.

        Raises:
            ValueError: If ``compression`` is not a valid codec (or, from
                DataChunker, if ``chunk_size`` is not positive).

        """
        if compression not in VALID_ORC_COMPRESSION:
            raise ValueError(f"Invalid compression: {compression}")

        self.chunk_size = chunk_size
        self.chunker = DataChunker(chunk_size)
        self.compression = compression

    def _get_compressed_path(self, path: str) -> str:
        """Insert the compression codec into the file name before ``.orc``.

        ``out.orc`` becomes ``out.snappy.orc``; an existing codec marker such
        as ``out.zlib.orc`` is replaced. The path is returned unchanged for
        the ``UNCOMPRESSED`` codec.

        Args:
            path: Requested output path.

        Returns:
            str: The path carrying the active codec in its file name.

        """
        if self.compression == "UNCOMPRESSED":
            return path

        # Only trailing suffixes are touched. Replacing ".orc" or a codec name
        # anywhere in the string would corrupt paths such as
        # "/tmp/x.orchestra/out.orc" or "/data/v1.zstd_runs/out.orc".
        base = path.removesuffix(".orc")
        for codec in VALID_ORC_COMPRESSION:
            marker = f".{codec.lower()}"
            if base.endswith(marker):
                base = base.removesuffix(marker)
                break
        return f"{base}.{self.compression.lower()}.orc"

    def export(
        self,
        data: list[dict[str, Any]],
        output_path: str,
        schema: dict | None = None,
    ) -> None:
        """Export data to ORC file format.

        The data is written chunk by chunk through a single ORC writer that
        is created when the first chunk is ready.

        Args:
            data: List of dictionaries containing the data to export.
            output_path: Path where the ORC file will be written; must end in
                ``.orc``. The codec name is inserted before the extension.
            schema: Schema definition for the data (not used for ORC).

        Raises:
            RuntimeError: If ``data`` is empty or writing the file fails.
            ValueError: If ``output_path`` does not end in ``.orc``.
            TypeError: If ``data`` is not a list.

        """
        if not data:
            logger.info("No data to export")
            raise RuntimeError("No data to export")

        # Validate base path ends with .orc before adding compression
        if not output_path.endswith(".orc"):
            raise ValueError("Output path must have .orc extension")

        output_path = self._get_compressed_path(output_path)

        # Checked before len(data) is taken so that unsized inputs (e.g. a
        # generator) fail with this message instead of an opaque len() error.
        if not isinstance(data, list):
            raise TypeError("Data must be a list of dictionaries")
        logger.info(f"Starting ORC export to {output_path} with {len(data)} rows")

        writer = None
        try:
            for i, chunk in enumerate(self.chunker.chunk_data(data), 1):
                logger.debug(f"Processing chunk {i}")
                df = pd.DataFrame(chunk)
                # The "uuid" column is stringified before the Arrow conversion.
                if "uuid" in df.columns:
                    df["uuid"] = df["uuid"].astype(str)
                table = pa.Table.from_pandas(df)
                # The writer is created lazily, once the first chunk exists.
                if writer is None:
                    writer = po.ORCWriter(
                        str(output_path),
                        compression=self.compression,
                    )
                writer.write(table)
            logger.info(f"Successfully exported {len(data)} rows to {output_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to write Orc file: {e}") from e
        finally:
            if writer is not None:
                writer.close()
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import pyarrow as pa
|
|
6
|
+
import pyarrow.parquet as pq
|
|
7
|
+
|
|
8
|
+
from .base_exporter import BaseExporter
|
|
9
|
+
from .utils.chunker import DataChunker
|
|
10
|
+
from .utils.constants import (
|
|
11
|
+
CHUNK_SIZE_PARQUET,
|
|
12
|
+
DEFAULT_PARQUET_COMPRESSION,
|
|
13
|
+
VALID_PARQUET_COMPRESSION,
|
|
14
|
+
ParquetCompression,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ParquetExporter(BaseExporter):
    """Exporter class for writing data to Parquet format files."""

    def __init__(
        self,
        chunk_size: int = CHUNK_SIZE_PARQUET,
        compression: ParquetCompression = DEFAULT_PARQUET_COMPRESSION,
    ) -> None:
        """Initialize the exporter with the specified options.

        Args:
            chunk_size (int, optional): Number of rows to write at once.
                Defaults to CHUNK_SIZE_PARQUET.
            compression (str, optional): Compression codec; must be one of
                VALID_PARQUET_COMPRESSION.

        Raises:
            ValueError: If ``compression`` is not a valid codec (or, from
                DataChunker, if ``chunk_size`` is not positive).

        """
        if compression not in VALID_PARQUET_COMPRESSION:
            raise ValueError(f"Invalid compression: {compression}")

        self.chunk_size = chunk_size
        self.compression = compression
        self.chunker = DataChunker(chunk_size)

    def _get_compressed_path(self, path: str) -> str:
        """Insert the compression codec into the file name before ``.parquet``.

        ``out.parquet`` becomes ``out.snappy.parquet``; an existing codec
        marker such as ``out.gzip.parquet`` is replaced. The path is returned
        unchanged for the ``none`` codec.

        Args:
            path: Requested output path.

        Returns:
            str: The path carrying the active codec in its file name.

        """
        if self.compression == "none":
            return path

        # Only trailing suffixes are touched. Replacing ".parquet" or a codec
        # name anywhere in the string would corrupt paths such as
        # "/data/v1.gzipped/out.parquet".
        base = path.removesuffix(".parquet")
        for codec in VALID_PARQUET_COMPRESSION:
            marker = f".{codec}"
            if base.endswith(marker):
                base = base.removesuffix(marker)
                break
        return f"{base}.{self.compression}.parquet"

    def export(
        self,
        data: list[dict[str, Any]],
        output_path: str,
        schema: dict | None = None,
    ) -> None:
        """Export data to a Parquet file.

        The data is written chunk by chunk through a single Parquet writer
        that is created with the Arrow schema of the first chunk.

        Args:
            data: List of dictionaries containing the data to export.
            output_path: Path where the Parquet file will be written; must end
                in ``.parquet``. The codec name is inserted before the
                extension.
            schema: Optional dictionary defining the data schema (not used by
                this method).

        Raises:
            RuntimeError: If ``data`` is empty or writing the file fails.
            ValueError: If ``output_path`` does not end in ``.parquet``.
            TypeError: If ``data`` is not a list.

        """
        if not data:
            logger.info("No data to export")
            raise RuntimeError("No data to export")

        # Validate base path ends with .parquet before adding compression
        if not output_path.endswith(".parquet"):
            raise ValueError("Output path must have .parquet extension")

        output_path = self._get_compressed_path(output_path)

        # Checked before len(data) is taken so that unsized inputs (e.g. a
        # generator) fail with this message instead of an opaque len() error.
        if not isinstance(data, list):
            raise TypeError("Data must be a list of dictionaries")
        logger.info(f"Starting Parquet export to {output_path} with {len(data)} rows")

        writer = None
        try:
            for i, chunk in enumerate(self.chunker.chunk_data(data), 1):
                logger.debug(f"Processing chunk {i}")
                df = pd.DataFrame(chunk)
                # The "uuid" column is stringified before the Arrow conversion.
                if "uuid" in df.columns:
                    df["uuid"] = df["uuid"].astype(str)
                table = pa.Table.from_pandas(df)
                # Initialize writer with first chunk's schema
                if writer is None:
                    writer = pq.ParquetWriter(
                        output_path,
                        table.schema,
                        compression=self.compression,
                    )
                writer.write_table(table)
            logger.info(f"Successfully exported {len(data)} rows to {output_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to write Parquet file: {e}") from e
        finally:
            if writer is not None:
                writer.close()
|
|
File without changes
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import Iterator
|
|
3
|
+
from typing import TypeVar
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)

T = TypeVar("T")


class DataChunker:
    """Split a list into consecutive fixed-size slices."""

    def __init__(self, chunk_size: int) -> None:
        """Store the slice length.

        Args:
            chunk_size: Maximum number of items per chunk.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative.

        """
        if chunk_size <= 0:
            raise ValueError("Chunk size must be greater than zero")
        self.chunk_size = chunk_size

    def chunk_data(self, data: list[T]) -> Iterator[list[T]]:
        """Yield successive slices of ``data``, each at most ``chunk_size`` long.

        A debug message with the running chunk number is logged before each
        slice is handed out. Empty input yields nothing.
        """
        # Ceiling division: number of slices needed to cover the whole list.
        expected = -(-len(data) // self.chunk_size)
        offsets = range(0, len(data), self.chunk_size)
        for ordinal, begin in enumerate(offsets, start=1):
            batch = data[begin : begin + self.chunk_size]
            logger.debug(f"Processing chunk {ordinal}/{expected}")
            yield batch
            # Drop our reference as soon as the consumer asks for more.
            del batch
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from typing import Literal, cast
|
|
2
|
+
|
|
3
|
+
# --- Export formats ---------------------------------------------------------
EXPORT_FORMATS = ["csv", "json", "parquet", "orc", "mysql", "mssql", "oracle"]
# Anchored alternation over the format names listed above.
EXPORT_PATTERNS = "^(" + "|".join(EXPORT_FORMATS) + ")$"

# --- Chunk sizes (rows per write) -------------------------------------------
CHUNK_SIZE = 10000
# Every format currently shares the common default.
CHUNK_SIZE_JSON = CHUNK_SIZE_CSV = CHUNK_SIZE_PARQUET = CHUNK_SIZE_ORC = CHUNK_SIZE

# --- ORC --------------------------------------------------------------------
OrcCompression = Literal["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"]
OrcStrategy = Literal["SPEED", "COMPRESSION"]

# Runtime lists mirroring the Literal aliases, used for validation.
VALID_ORC_COMPRESSION: list[OrcCompression] = cast(
    list[OrcCompression],
    ["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"],
)
VALID_ORC_STRATEGIES: list[OrcStrategy] = cast(
    list[OrcStrategy],
    ["SPEED", "COMPRESSION"],
)

DEFAULT_ORC_COMPRESSION: OrcCompression = "SNAPPY"
DEFAULT_ORC_STRATEGY: OrcStrategy = "SPEED"

# --- Parquet (codec names spelled in lowercase) ------------------------------
ParquetCompression = Literal["none", "snappy", "gzip", "brotli", "lz4", "zstd"]

# Runtime list mirroring the Literal alias, used for validation.
VALID_PARQUET_COMPRESSION: list[ParquetCompression] = cast(
    list[ParquetCompression],
    ["none", "snappy", "gzip", "brotli", "lz4", "zstd"],
)

DEFAULT_PARQUET_COMPRESSION: ParquetCompression = "snappy"

# --- Default schema -----------------------------------------------------------
# One field per supported logical type; a value is either a bare type name or
# a dict carrying the type plus extra options.
DEFAULT_SCHEMA = {
    "username": {"type": "string", "faker": "name"},
    "bio": "text",
    "age": "integer",
    "emp_number": "bigint",
    "height": "float",
    "salary": "decimal",
    "is_active": "boolean",
    "signup_date": "date",
    "signup_dt": "datetime",
    "profile_picture": "blob",
    "uuid": "uuid",
    "user_enum": {"type": "enum", "values": ["active", "inactive", "pending"]},
}
|