TestDataX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ import decimal
2
+ from datetime import date, datetime
3
+ from typing import Any
4
+ from uuid import UUID
5
+
6
+ from .base_exporter import BaseExporter
7
+ from .utils.constants import DEFAULT_SCHEMA
8
+
9
# Lookup table from the exporter's abstract field types to MySQL column types.
# "enum" maps to a bare ENUM keyword; MysqlExporter._create_table_stmt swaps in
# the full ENUM('a','b') form when the schema entry supplies its values.
MYSQL_TYPE_MAPPING = dict(
    string="VARCHAR(255)",
    text="TEXT",
    integer="INT",
    bigint="BIGINT",
    float="FLOAT",
    decimal="DECIMAL(18,2)",
    boolean="TINYINT(1)",
    date="DATE",
    datetime="DATETIME",
    blob="LONGBLOB",
    uuid="VARCHAR(36)",
    enum="ENUM",
)
23
+
24
+
25
class MysqlExporter(BaseExporter):
    """Exports data to MySQL compatible SQL file."""

    # Escape sequences MySQL recognises inside single-quoted string literals.
    # The backslash itself has to be escaped: otherwise an input containing a
    # backslash followed by a quote would produce an escaped backslash and a
    # bare quote, terminating the literal early.
    _STRING_ESCAPES = str.maketrans(
        {
            "\\": "\\\\",
            "'": "\\'",
            "\n": "\\n",
            "\r": "\\r",
            "\0": "\\0",
            "\x1a": "\\Z",
        }
    )

    def _format_value(
        self,
        value: (
            None
            | str
            | UUID
            | datetime
            | date
            | bool
            | bytes
            | int
            | float
            | decimal.Decimal
        ),
    ) -> str:
        """Format a value as a MySQL literal.

        Args:
            value: The value to format. Supported types:
                - None: rendered as NULL
                - str / UUID: converted to text, escaped (backslash, single
                  quote, newline, carriage return, NUL and Ctrl-Z) and wrapped
                  in single quotes
                - datetime / date: ISO format string wrapped in single quotes
                - bool: 1 for True, 0 for False
                - bytes: hexadecimal literal of the form x'...'
                - int / float / decimal.Decimal: plain str() representation

        Returns:
            str: The literal ready to be embedded in a MySQL statement.

        Raises:
            ValueError: If the input value type is not supported.

        """
        if value is None:
            return "NULL"
        if isinstance(value, (str | UUID)):
            return "'" + str(value).translate(self._STRING_ESCAPES) + "'"
        if isinstance(value, (datetime | date)):
            return f"'{value.isoformat()}'"
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, bytes):
            return f"x'{value.hex()}'"
        if isinstance(value, (int | float | decimal.Decimal)):
            return str(value)
        raise ValueError(f"Unsupported type: {type(value)}")

    def _get_column_type(self, field: dict) -> str:
        """Get the MySQL column type based on the field type.

        Args:
            field (dict): Field definition. Its "type" entry is either a type
                name or a nested dictionary that itself has a "type" key.
                A missing "type" defaults to "string".

        Returns:
            str: The corresponding MySQL data type from MYSQL_TYPE_MAPPING.

        Raises:
            KeyError: If the field type is not found in MYSQL_TYPE_MAPPING.

        """
        field_type = field.get("type", "string")
        if isinstance(field_type, dict):
            field_type = field_type.get("type", "string")
        return MYSQL_TYPE_MAPPING[field_type]

    def _create_table_stmt(self, schema: dict, table_name: str = "output") -> str:
        """Generate a MySQL CREATE TABLE statement based on provided schema.

        Every column is declared nullable. An enum field that lists its
        "values" is rendered as ENUM('a','b',...) with each value escaped the
        same way string literals are.

        Args:
            schema (dict): Field names mapped to either a type name or a
                definition dictionary with a "type" key.
            table_name (str, optional): Name for the table. Defaults to "output".

        Returns:
            str: The CREATE TABLE statement followed by a blank line.

        Raises:
            KeyError: If a field type is not found in MYSQL_TYPE_MAPPING.

        Example:
            schema = {
                "id": "integer",
                "status": {"type": "enum", "values": ["active", "inactive"]}
            }
            result = _create_table_stmt(schema, "users")
            # Returns: CREATE TABLE users (
            #  id INT NULL,
            #  status ENUM('active','inactive') NULL
            # );

        """
        columns = []
        for field_name, field_def in schema.items():
            field_type_dict = (
                field_def if isinstance(field_def, dict) else {"type": field_def}
            )
            sql_type = self._get_column_type(field_type_dict)
            if (
                isinstance(field_def, dict)
                and field_def.get("type") == "enum"
                and "values" in field_def
            ):
                # Escape each value so a quote or backslash inside an enum
                # member cannot break out of the literal.
                values = "','".join(
                    str(v).translate(self._STRING_ESCAPES)
                    for v in field_def["values"]
                )
                sql_type = f"ENUM('{values}')"
            columns.append(f" {field_name} {sql_type} NULL")

        return f"CREATE TABLE {table_name} (\n" + ",\n".join(columns) + "\n);\n\n"

    def _create_insert_stmt(
        self, row: dict[str, Any], table_name: str = "output"
    ) -> str:
        """Create a MySQL INSERT statement from a dictionary of values.

        Column names are taken from the row keys as-is (they are not quoted);
        values are rendered through _format_value.

        Args:
            row (dict[str, Any]): Dictionary containing column names as keys
                and values to insert
            table_name (str, optional): Name of the target table. Defaults to "output"

        Returns:
            str: Formatted MySQL INSERT statement string

        Raises:
            ValueError: If a value has a type _format_value does not support.

        Example:
            row = {"id": 1, "name": "test"}
            _create_insert_stmt(row, "users")
            # Returns: INSERT INTO users (id, name) VALUES (1, 'test');

        """
        columns = ", ".join(row.keys())
        values = ", ".join(self._format_value(v) for v in row.values())
        return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to MySQL compatible SQL file.

        Writes one CREATE TABLE statement followed by one INSERT per row.
        Nothing is written when data is empty.

        Args:
            data: Rows to export, one dictionary per row.
            output_path: Destination file. The table name is the file name up
                to its first dot.
            schema: Optional schema for the CREATE TABLE statement;
                DEFAULT_SCHEMA is used when omitted or empty.

        """
        if not data:
            return

        # Local import keeps this block self-contained; os.path.basename
        # honours the platform's separators, unlike splitting on "/" alone.
        import os

        table_name = os.path.basename(output_path).split(".")[0]

        # Explicit encoding so the file does not depend on the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
            schema_to_use = schema or DEFAULT_SCHEMA
            f.write(self._create_table_stmt(schema_to_use, table_name))
            f.writelines(
                self._create_insert_stmt(row, table_name) + "\n" for row in data
            )
@@ -0,0 +1,205 @@
1
+ import decimal
2
+ from datetime import date, datetime
3
+ from typing import Any
4
+ from uuid import UUID
5
+
6
+ from .base_exporter import BaseExporter
7
+ from .utils.constants import DEFAULT_SCHEMA
8
+
9
# Lookup table from the exporter's abstract field types to Oracle column types.
ORACLE_TYPE_MAPPING = dict(
    string="VARCHAR2(255)",  # variable-length character string
    text="CLOB",  # large character object
    integer="NUMBER(10)",  # integers are modelled with NUMBER precision
    bigint="NUMBER(19)",  # wide enough for 64-bit integers
    float="FLOAT",
    decimal="NUMBER(18,2)",  # fixed precision and scale
    boolean="NUMBER(1)",  # stored as 0/1
    date="DATE",
    datetime="TIMESTAMP",
    blob="BLOB",  # binary large object
    uuid="VARCHAR2(36)",  # stored in textual form
    enum="VARCHAR2(255)",  # allowed values are enforced with a CHECK constraint
)
23
+
24
+
25
class OracleExporter(BaseExporter):
    """Exports data to ORACLE compatible SQL file."""

    # Oracle datetime format masks matching datetime.isoformat() output with a
    # fixed microsecond part (and, for aware values, a +HH:MM offset).
    _TIMESTAMP_MASK = 'YYYY-MM-DD"T"HH24:MI:SS.FF'
    _TIMESTAMP_TZ_MASK = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'

    def _format_value(
        self,
        value: (
            None
            | str
            | UUID
            | datetime
            | date
            | bool
            | bytes
            | int
            | float
            | decimal.Decimal
        ),
    ) -> str:
        """Format a value as an Oracle SQL expression.

        Args:
            value: The value to format. Supported types:
                - None: rendered as NULL
                - str: single quotes doubled, wrapped in single quotes
                - UUID: textual form wrapped in single quotes
                - datetime: TO_TIMESTAMP(...) for naive values and
                  TO_TIMESTAMP_TZ(...) for timezone-aware values; the
                  fractional seconds are always written out
                - date: ANSI date literal, DATE 'YYYY-MM-DD'
                - bool: 1 for True, 0 for False
                - bytes: HEXTORAW('...') over the hexadecimal digits
                - int / float / decimal.Decimal: plain str() representation

        Returns:
            str: The expression ready to be embedded in an Oracle statement.

        Raises:
            ValueError: If the input value type is not supported.

        """
        if value is None:
            return "NULL"
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"  # '' escapes a quote
        if isinstance(value, UUID):
            return f"'{value}'"
        # datetime must be tested before date because it is a subclass of date.
        if isinstance(value, datetime):
            # Always emit microseconds so the text has one predictable shape.
            stamp = value.isoformat(timespec="microseconds")
            if value.utcoffset() is not None:
                # An aware value carries a UTC offset that the plain
                # TO_TIMESTAMP mask has no element for.
                return f"TO_TIMESTAMP_TZ('{stamp}', '{self._TIMESTAMP_TZ_MASK}')"
            return f"TO_TIMESTAMP('{stamp}', '{self._TIMESTAMP_MASK}')"
        if isinstance(value, date):
            return f"DATE '{value.isoformat()}'"
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, bytes):
            return f"HEXTORAW('{value.hex()}')"
        if isinstance(value, (int | float | decimal.Decimal)):
            return str(value)
        raise ValueError(f"Unsupported type: {type(value)}")

    def _get_column_type(self, field: dict) -> str:
        """Get the ORACLE column type based on the field type.

        Args:
            field (dict): Field definition. Its "type" entry is either a type
                name or a nested dictionary that itself has a "type" key.
                A missing "type" defaults to "string".

        Returns:
            str: The corresponding ORACLE data type from ORACLE_TYPE_MAPPING.

        Raises:
            KeyError: If the field type is not found in ORACLE_TYPE_MAPPING.

        """
        field_type = field.get("type", "string")
        if isinstance(field_type, dict):
            field_type = field_type.get("type", "string")
        return ORACLE_TYPE_MAPPING[field_type]

    def _create_table_stmt(self, schema: dict, table_name: str = "output") -> str:
        """Generate a ORACLE CREATE TABLE statement based on provided schema.

        Every column is declared nullable. For an enum field that lists its
        "values", a table-level CHECK constraint restricting the column to
        those values is appended after the column list.

        Args:
            schema (dict): Field names mapped to either a type name or a
                definition dictionary with a "type" key.
            table_name (str, optional): Name for the table. Defaults to "output".

        Returns:
            str: The CREATE TABLE statement followed by a blank line.

        Raises:
            KeyError: If a field type is not found in ORACLE_TYPE_MAPPING.

        Example:
            schema = {
                "id": "integer",
                "status": {"type": "enum", "values": ["active", "inactive"]}
            }
            result = _create_table_stmt(schema, "users")
            # Returns: CREATE TABLE users (
            #  id NUMBER(10) NULL,
            #  status VARCHAR2(255) NULL,
            #  CHECK (status IN ('active','inactive'))
            # );

        """
        columns = []
        check_constraints = []

        for field_name, field_def in schema.items():
            field_type_dict = (
                field_def if isinstance(field_def, dict) else {"type": field_def}
            )
            sql_type = self._get_column_type(field_type_dict)

            if (
                isinstance(field_def, dict)
                and field_def.get("type") == "enum"
                and "values" in field_def
            ):
                # Double embedded quotes so an enum member cannot terminate
                # the literal inside the CHECK expression.
                values = "','".join(
                    str(v).replace("'", "''") for v in field_def["values"]
                )
                check_constraints.append(f"CHECK ({field_name} IN ('{values}'))")

            columns.append(f" {field_name} {sql_type} NULL")

        # Column definitions first, then the table-level CHECK constraints.
        body_lines = columns + [f" {check}" for check in check_constraints]
        return f"CREATE TABLE {table_name} (\n" + ",\n".join(body_lines) + "\n);\n\n"

    def _create_insert_stmt(
        self, row: dict[str, Any], table_name: str = "output"
    ) -> str:
        """Create a ORACLE INSERT statement from a dictionary of values.

        Column names are taken from the row keys as-is (they are not quoted);
        values are rendered through _format_value.

        Args:
            row (dict[str, Any]): Dictionary containing column names as keys
                and values to insert
            table_name (str, optional): Name of the target table. Defaults to "output"

        Returns:
            str: Formatted ORACLE INSERT statement string

        Raises:
            ValueError: If a value has a type _format_value does not support.

        Example:
            row = {"id": 1, "name": "test"}
            _create_insert_stmt(row, "users")
            # Returns: INSERT INTO users (id, name) VALUES (1, 'test');

        """
        columns = ", ".join(row.keys())
        values = ", ".join(self._format_value(v) for v in row.values())
        return f"INSERT INTO {table_name} ({columns}) VALUES ({values});"

    def export(
        self, data: list[dict[str, Any]], output_path: str, schema: dict | None = None
    ) -> None:
        """Export data to ORACLE compatible SQL file.

        Writes one CREATE TABLE statement followed by one INSERT per row.
        Nothing is written when data is empty.

        Args:
            data: Rows to export, one dictionary per row.
            output_path: Destination file. The table name is the file name up
                to its first dot.
            schema: Optional schema for the CREATE TABLE statement;
                DEFAULT_SCHEMA is used when omitted or empty.

        """
        if not data:
            return

        # Local import keeps this block self-contained; os.path.basename
        # honours the platform's separators, unlike splitting on "/" alone.
        import os

        table_name = os.path.basename(output_path).split(".")[0]

        # Explicit encoding so the file does not depend on the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            # Always write CREATE TABLE using DEFAULT_SCHEMA if no schema provided
            schema_to_use = schema or DEFAULT_SCHEMA
            f.write(self._create_table_stmt(schema_to_use, table_name))
            f.writelines(
                self._create_insert_stmt(row, table_name) + "\n" for row in data
            )
@@ -0,0 +1,100 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ import pandas as pd
5
+ import pyarrow as pa
6
+ import pyarrow.orc as po
7
+
8
+ from .base_exporter import BaseExporter
9
+ from .utils.chunker import DataChunker
10
+ from .utils.constants import (
11
+ CHUNK_SIZE_ORC,
12
+ DEFAULT_ORC_COMPRESSION,
13
+ VALID_ORC_COMPRESSION,
14
+ OrcCompression,
15
+ )
16
+
17
logger = logging.getLogger(__name__)


class OrcExporter(BaseExporter):
    """Exports data to ORC file format using PyArrow and Pandas."""

    def __init__(
        self,
        chunk_size: int = CHUNK_SIZE_ORC,
        compression: OrcCompression = DEFAULT_ORC_COMPRESSION,
    ) -> None:
        """Initialize Exporter with specified options.

        Args:
            chunk_size: Number of rows to write at once
            compression: Compression codec; must be one of
                VALID_ORC_COMPRESSION

        Raises:
            ValueError: If compression is not in VALID_ORC_COMPRESSION.

        """
        if compression not in VALID_ORC_COMPRESSION:
            raise ValueError(f"Invalid compression: {compression}")

        self.chunker = DataChunker(chunk_size)
        self.compression = compression

    def _get_compressed_path(self, path: str) -> str:
        """Insert compression type into filename before extension.

        Only the trailing ".orc" extension and any compression suffixes
        directly in front of it are rewritten; directory names and other parts
        of the file name that happen to contain ".orc" or a codec name are
        left untouched.

        Args:
            path: Requested output path, e.g. "out/data.orc".

        Returns:
            str: The path unchanged for "UNCOMPRESSED", otherwise
            "<base>.<codec>.orc" with the codec in lower case.

        """
        if self.compression == "UNCOMPRESSED":
            return path

        base = path.removesuffix(".orc")
        # Drop compression suffixes already present at the end of the name so
        # "data.zlib.orc" becomes "data.<codec>.orc" rather than stacking.
        codec_suffixes = tuple(f".{comp.lower()}" for comp in VALID_ORC_COMPRESSION)
        while base.endswith(codec_suffixes):
            base = base[: base.rfind(".")]
        return f"{base}.{self.compression.lower()}.orc"

    def export(
        self,
        data: list[dict[str, Any]],
        output_path: str,
        schema: dict | None = None,
    ) -> None:
        """Export data to ORC file format.

        Args:
            data: List of dictionaries containing the data to export
            output_path: Path where the ORC file will be written; the
                compression codec is inserted before the extension
            schema: Schema definition for the data (not used for ORC)

        Raises:
            RuntimeError: If data is empty or writing the file fails.
            ValueError: If output_path does not end with ".orc".
            TypeError: If data is not a list.

        """
        if not data:
            logger.info("No data to export")
            raise RuntimeError("No data to export")

        # Validate base path ends with .orc before adding compression
        if not output_path.endswith(".orc"):
            raise ValueError("Output path must have .orc extension")

        # Check the container type before len(data) is needed for logging.
        if not isinstance(data, list):
            raise TypeError("Data must be a list of dictionaries")

        output_path = self._get_compressed_path(output_path)
        logger.info(f"Starting ORC export to {output_path} with {len(data)} rows")

        writer = None
        try:
            for i, chunk in enumerate(self.chunker.chunk_data(data), 1):
                logger.debug(f"Processing chunk {i}")
                df = pd.DataFrame(chunk)
                # The "uuid" column is stringified before conversion to Arrow.
                if "uuid" in df.columns:
                    df["uuid"] = df["uuid"].astype(str)
                table = pa.Table.from_pandas(df)
                # The writer is opened lazily, once the first chunk exists.
                if writer is None:
                    writer = po.ORCWriter(
                        str(output_path),
                        compression=self.compression,
                    )
                writer.write(table)
            logger.info(f"Successfully exported {len(data)} rows to {output_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to write Orc file: {e}") from e
        finally:
            if writer is not None:
                writer.close()
@@ -0,0 +1,102 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ import pandas as pd
5
+ import pyarrow as pa
6
+ import pyarrow.parquet as pq
7
+
8
+ from .base_exporter import BaseExporter
9
+ from .utils.chunker import DataChunker
10
+ from .utils.constants import (
11
+ CHUNK_SIZE_PARQUET,
12
+ DEFAULT_PARQUET_COMPRESSION,
13
+ VALID_PARQUET_COMPRESSION,
14
+ ParquetCompression,
15
+ )
16
+
17
logger = logging.getLogger(__name__)


class ParquetExporter(BaseExporter):
    """Exporter class for writing data to Parquet format files."""

    def __init__(
        self,
        chunk_size: int = CHUNK_SIZE_PARQUET,
        compression: ParquetCompression = DEFAULT_PARQUET_COMPRESSION,
    ) -> None:
        """Initialize Exporter with specified chunk size.

        Args:
            chunk_size (int, optional): Number of rows to write at once.
                Defaults to CHUNK_SIZE_PARQUET.
            compression (str, optional): Compression codec; must be one of
                VALID_PARQUET_COMPRESSION.

        Raises:
            ValueError: If compression is not in VALID_PARQUET_COMPRESSION.

        """
        if compression not in VALID_PARQUET_COMPRESSION:
            raise ValueError(f"Invalid compression: {compression}")

        self.chunk_size = chunk_size
        self.compression = compression
        self.chunker = DataChunker(chunk_size)

    def _get_compressed_path(self, path: str) -> str:
        """Insert compression type into filename before extension.

        Only the trailing ".parquet" extension and any compression suffixes
        directly in front of it are rewritten; directory names and other parts
        of the file name that happen to contain ".parquet" or a codec name are
        left untouched.

        Args:
            path: Requested output path, e.g. "out/data.parquet".

        Returns:
            str: The path unchanged for "none", otherwise
            "<base>.<codec>.parquet".

        """
        if self.compression == "none":
            return path

        base = path.removesuffix(".parquet")
        # Drop compression suffixes already present at the end of the name so
        # "data.gzip.parquet" becomes "data.<codec>.parquet" rather than stacking.
        codec_suffixes = tuple(f".{comp}" for comp in VALID_PARQUET_COMPRESSION)
        while base.endswith(codec_suffixes):
            base = base[: base.rfind(".")]
        return f"{base}.{self.compression}.parquet"

    def export(
        self,
        data: list[dict[str, Any]],
        output_path: str,
        schema: dict | None = None,
    ) -> None:
        """Export data to a Parquet file.

        Rows are written in chunks of the size given at construction time.

        Args:
            data: List of dictionaries containing the data to export
            output_path: Path where the Parquet file will be written; the
                compression codec is inserted before the extension
            schema: Optional dictionary defining the data schema (not read by
                this method)

        Raises:
            RuntimeError: If data is empty or writing the file fails.
            ValueError: If output_path does not end with ".parquet".
            TypeError: If data is not a list.

        """
        if not data:
            logger.info("No data to export")
            raise RuntimeError("No data to export")

        # Validate base path ends with .parquet before adding compression
        if not output_path.endswith(".parquet"):
            raise ValueError("Output path must have .parquet extension")

        # Check the container type before len(data) is needed for logging.
        if not isinstance(data, list):
            raise TypeError("Data must be a list of dictionaries")

        output_path = self._get_compressed_path(output_path)
        logger.info(f"Starting Parquet export to {output_path} with {len(data)} rows")

        writer = None
        try:
            for i, chunk in enumerate(self.chunker.chunk_data(data), 1):
                logger.debug(f"Processing chunk {i}")
                df = pd.DataFrame(chunk)
                # The "uuid" column is stringified before conversion to Arrow.
                if "uuid" in df.columns:
                    df["uuid"] = df["uuid"].astype(str)
                table = pa.Table.from_pandas(df)
                # Initialize writer with first chunk's schema
                if writer is None:
                    writer = pq.ParquetWriter(
                        output_path,
                        table.schema,
                        compression=self.compression,
                    )
                writer.write_table(table)
            logger.info(f"Successfully exported {len(data)} rows to {output_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to write Parquet file: {e}") from e
        finally:
            if writer is not None:
                writer.close()
File without changes
@@ -0,0 +1,27 @@
1
+ import logging
2
+ from collections.abc import Iterator
3
+ from typing import TypeVar
4
+
5
logger = logging.getLogger(__name__)

T = TypeVar("T")


class DataChunker:
    """Split a list into consecutive fixed-size slices."""

    def __init__(self, chunk_size: int) -> None:
        """Store the slice length, rejecting zero and negative sizes.

        Raises:
            ValueError: If chunk_size is not greater than zero.
        """
        if not chunk_size > 0:
            raise ValueError("Chunk size must be greater than zero")
        self.chunk_size = chunk_size

    def chunk_data(self, data: list[T]) -> Iterator[list[T]]:
        """Yield successive slices of data, each at most chunk_size long.

        A debug message with the running and total chunk count is logged for
        every slice. The final slice may be shorter than chunk_size; empty
        input yields nothing.
        """
        size = self.chunk_size
        length = len(data)
        # Ceiling division: number of slices needed to cover the input.
        total_chunks = (length + size - 1) // size
        chunk_num = 0
        for start in range(0, length, size):
            chunk_num += 1
            logger.debug(f"Processing chunk {chunk_num}/{total_chunks}")
            # Yield the slice without binding it locally, so the generator
            # keeps no reference to it while the consumer works on it.
            yield data[start : start + size]
@@ -0,0 +1,55 @@
1
+ from typing import Literal, cast
2
+
3
# Supported export targets, plus an anchored regex that matches exactly one of them.
EXPORT_FORMATS = ["csv", "json", "parquet", "orc", "mysql", "mssql", "oracle"]
EXPORT_PATTERNS = "^(" + "|".join(EXPORT_FORMATS) + ")$"

# Rows written per chunk; every format currently shares the same default.
CHUNK_SIZE = 10000
CHUNK_SIZE_JSON = CHUNK_SIZE_CSV = CHUNK_SIZE_PARQUET = CHUNK_SIZE_ORC = CHUNK_SIZE

# ORC: codec and strategy names as type aliases.
OrcCompression = Literal["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"]
OrcStrategy = Literal["SPEED", "COMPRESSION"]

# ORC: runtime lists mirroring the aliases above, used for validation.
VALID_ORC_COMPRESSION: list[OrcCompression] = cast(
    list[OrcCompression],
    ["UNCOMPRESSED", "SNAPPY", "ZLIB", "LZ4", "ZSTD"],
)
VALID_ORC_STRATEGIES: list[OrcStrategy] = cast(
    list[OrcStrategy],
    ["SPEED", "COMPRESSION"],
)

# ORC: defaults.
DEFAULT_ORC_COMPRESSION: OrcCompression = "SNAPPY"
DEFAULT_ORC_STRATEGY: OrcStrategy = "SPEED"

# Parquet: codec names, spelled in lower case.
ParquetCompression = Literal["none", "snappy", "gzip", "brotli", "lz4", "zstd"]

# Parquet: runtime list mirroring the alias above, used for validation.
VALID_PARQUET_COMPRESSION: list[ParquetCompression] = cast(
    list[ParquetCompression],
    ["none", "snappy", "gzip", "brotli", "lz4", "zstd"],
)

# Parquet: default codec.
DEFAULT_PARQUET_COMPRESSION: ParquetCompression = "snappy"

# Schema used when the caller supplies none. A value is either a bare type
# name or a dictionary with a "type" key plus type-specific options.
DEFAULT_SCHEMA = {
    "username": {"type": "string", "faker": "name"},
    "bio": "text",
    "age": "integer",
    "emp_number": "bigint",
    "height": "float",
    "salary": "decimal",
    "is_active": "boolean",
    "signup_date": "date",
    "signup_dt": "datetime",
    "profile_picture": "blob",
    "uuid": "uuid",
    "user_enum": {"type": "enum", "values": ["active", "inactive", "pending"]},
}