aio-sf 0.1.0b3__tar.gz → 0.1.0b5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/PKG-INFO +2 -2
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/pyproject.toml +1 -1
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/bulk_export.py +41 -42
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/parquet_writer.py +107 -18
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.cursor/rules/api-structure.mdc +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.cursor/rules/async-patterns.mdc +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.cursor/rules/project-tooling.mdc +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.github/workflows/publish.yml +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.github/workflows/test.yml +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.gitignore +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/LICENSE +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/README.md +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/RELEASE.md +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/pytest.ini +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/base.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/client_credentials.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/refresh_token.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/sfdx_cli.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/static_token.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/bulk_v2/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/bulk_v2/client.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/bulk_v2/types.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/client.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/collections/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/collections/client.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/collections/types.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/describe/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/describe/client.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/describe/types.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/query/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/query/client.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/query/types.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/types.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/__init__.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/conftest.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/test_api_clients.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/test_auth.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/test_client.py +0 -0
- {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/uv.lock +0 -0

{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: aio-sf
-Version: 0.1.0b3
-Summary: Async Salesforce library for Python
+Version: 0.1.0b5
+Summary: Async Salesforce library for Python
 Project-URL: Homepage, https://github.com/callawaycloud/aio-salesforce
 Project-URL: Repository, https://github.com/callawaycloud/aio-salesforce
 Project-URL: Issues, https://github.com/callawaycloud/aio-salesforce/issues

{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/pyproject.toml

@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "aio-sf"
 dynamic = ["version"]
-description = "Async Salesforce library for Python
+description = "Async Salesforce library for Python"
 readme = "README.md"
 license = {file = "LICENSE"}
 authors = [

{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/bulk_export.py

@@ -2,6 +2,7 @@ import logging
 from typing import Any, Dict, List, Generator, Optional
 import csv
 import asyncio
+import io
 
 from ..api.describe.types import FieldInfo
 from ..api.client import SalesforceClient

@@ -111,37 +112,50 @@ class QueryResult:
         """
         Stream CSV response and convert to record dictionaries.
 
+        Uses proper CSV parsing to handle quotes, newlines, and special characters correctly.
+
         :param response_text: CSV response text
         :yields: Individual record dictionaries
         """
-
-
-        # Get the header row first
-        if not lines:
+        if not response_text or not response_text.strip():
             # No data in this batch
             return
 
         try:
-
-
-
-            #
-
+            # Create a StringIO object for proper CSV parsing
+            csv_buffer = io.StringIO(response_text)
+
+            # Use DictReader for proper CSV parsing with header detection
+            # This handles quotes, newlines in fields, and escaping correctly
+            csv_reader = csv.DictReader(
+                csv_buffer,
+                delimiter=",",
+                quotechar='"',
+                quoting=csv.QUOTE_MINIMAL,
+                skipinitialspace=True,
+            )
 
-
-        for line in lines[1:]:
-            if line.strip():  # Skip empty lines
+            for row_num, record in enumerate(csv_reader, start=1):
                 try:
-                    #
-
-
-
-
-
-
-
+                    # Convert None values to empty strings for consistency
+                    cleaned_record = {
+                        key: (value if value is not None else "")
+                        for key, value in record.items()
+                    }
+                    yield cleaned_record
+                except Exception as e:
+                    logging.warning(f"Error processing CSV record {row_num}: {e}")
+                    # Continue processing other records
                     continue
 
+        except csv.Error as e:
+            logging.error(f"CSV parsing error: {e}")
+            # If CSV parsing fails completely, don't yield any records
+            return
+        except Exception as e:
+            logging.error(f"Unexpected error parsing CSV response: {e}")
+            return
+
     async def _generate_records(self):
         """Async generator that yields individual records."""
         locator = self._query_locator
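
Note: the switch from manual line splitting to csv.DictReader is the core fix in this release. Bulk API CSV payloads can contain quoted fields with embedded commas and newlines, which a naive split("\n") tears apart. A minimal standalone sketch (not part of the package) of the difference:

import csv
import io

# A quoted field containing both a newline and a comma -- legal CSV.
response_text = 'Id,Description\n"001","Line one\nLine two, with a comma"\n'

# Naive approach (roughly what 0.1.0b3 did): the embedded newline
# falsely splits one record across two lines.
print(len(response_text.strip().split("\n")))  # 3

# DictReader approach (0.1.0b5): one record, field intact.
for record in csv.DictReader(io.StringIO(response_text)):
    print(record)  # {'Id': '001', 'Description': 'Line one\nLine two, with a comma'}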

@@ -170,7 +184,9 @@ class QueryResult:
 
             except Exception as e:
                 raise Exception(
-                    f"Error processing record {ctn}: {e}. Current Query Locator: {locator}"
+                    f"Error processing record {ctn}: {e}. Current Query Locator: {locator}. "
+                    f"This may indicate a CSV parsing issue - check if the response contains "
+                    f"malformed CSV data or fields with special characters."
                 )
 
 

@@ -296,32 +312,15 @@ def resume_from_locator(
 
 
 # Helper function to get all fields that can be queried by bulk API
-async def get_bulk_fields(
-
-) -> List[FieldInfo]:
-    """Get field metadata for queryable fields in a Salesforce object.
-
-    :param sf: Salesforce client instance
-    :param object_type: Name of the Salesforce object (e.g., 'Account', 'Contact')
-    :param api_version: API version to use (defaults to client version)
-    :returns: List of field metadata dictionaries for queryable fields
-    """
+async def get_bulk_fields(fields_metadata: List[FieldInfo]) -> List[FieldInfo]:
+    """Get field metadata for queryable fields in a Salesforce object."""
     # Use the metadata API to get object description
-    describe_data = await sf.describe.sobject(object_type, api_version)
-    fields_metadata = describe_data["fields"]
-
-    # Create a set of all compound field names to exclude
-    compound_field_names = {
-        field.get("compoundFieldName")
-        for field in fields_metadata
-        if field.get("compoundFieldName")
-    }
 
-    # Filter to only queryable fields that aren't compound fields
+    # Filter to only queryable fields that aren't compound fields (unless field is actually name)
     queryable_fields = [
         field
         for field in fields_metadata
-        if field.get("
+        if field.get("type") not in ["address", "location"]
     ]
 
     return queryable_fields
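
Note: get_bulk_fields now takes already-fetched field metadata instead of a client, object name, and API version, and the compound-field bookkeeping collapses into a plain type check (only the address and location compound types are excluded). The function keeps its async signature even though it no longer awaits anything. A small sketch with hypothetical metadata:

# Hypothetical FieldInfo-style dicts, for illustration only.
fields_metadata = [
    {"name": "Id", "type": "id"},
    {"name": "Name", "type": "string"},
    {"name": "BillingAddress", "type": "address"},  # compound: excluded
    {"name": "Geo__c", "type": "location"},         # compound: excluded
]

queryable = [f for f in fields_metadata if f.get("type") not in ["address", "location"]]
print([f["name"] for f in queryable])  # ['Id', 'Name']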

{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/parquet_writer.py

@@ -3,26 +3,37 @@ Parquet writer module for converting Salesforce QueryResult to Parquet format.
 """
 
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Callable
 from pathlib import Path
 import pyarrow as pa
 import pandas as pd
 import pyarrow.parquet as pq
+from datetime import datetime
 
 from ..api.describe.types import FieldInfo
 
 from .bulk_export import QueryResult, batch_records_async
 
 
-def salesforce_to_arrow_type(sf_type: str) -> pa.DataType:
-    """Convert Salesforce data types to Arrow data types."""
+def salesforce_to_arrow_type(
+    sf_type: str, convert_datetime_to_timestamp: bool = True
+) -> pa.DataType:
+    """Convert Salesforce data types to Arrow data types.
+
+    :param sf_type: Salesforce field type
+    :param convert_datetime_to_timestamp: If True, datetime fields use timestamp type, otherwise string
+    """
     type_mapping = {
         "string": pa.string(),
         "boolean": pa.bool_(),
         "int": pa.int64(),
         "double": pa.float64(),
-        "date": pa.string(),  #
-        "datetime":
+        "date": pa.string(),  # Always store as string since SF returns ISO format
+        "datetime": (
+            pa.timestamp("us", tz="UTC")
+            if convert_datetime_to_timestamp
+            else pa.string()
+        ),
         "currency": pa.float64(),
         "reference": pa.string(),
         "picklist": pa.string(),
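
Note: the datetime mapping is now conditional, so callers can keep Salesforce datetimes as raw ISO strings or get microsecond UTC timestamps. Assuming salesforce_to_arrow_type is imported from aio_sf.exporter.parquet_writer, the expected behavior is:

import pyarrow as pa

assert salesforce_to_arrow_type("datetime") == pa.timestamp("us", tz="UTC")
assert salesforce_to_arrow_type("datetime", convert_datetime_to_timestamp=False) == pa.string()
assert salesforce_to_arrow_type("date") == pa.string()          # dates stay ISO strings
assert salesforce_to_arrow_type("made_up_type") == pa.string()  # unknown types fall back to string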

@@ -40,18 +51,26 @@ def salesforce_to_arrow_type(sf_type: str) -> pa.DataType:
     return type_mapping.get(sf_type.lower(), pa.string())
 
 
-def create_schema_from_metadata(fields_metadata: List[FieldInfo]) -> pa.Schema:
+def create_schema_from_metadata(
+    fields_metadata: List[FieldInfo],
+    column_formatter: Optional[Callable[[str], str]] = None,
+    convert_datetime_to_timestamp: bool = True,
+) -> pa.Schema:
     """
     Create a PyArrow schema from Salesforce field metadata.
 
     :param fields_metadata: List of field metadata dictionaries from Salesforce
+    :param column_formatter: Optional function to format column names
+    :param convert_datetime_to_timestamp: If True, datetime fields use timestamp type, otherwise string
     :returns: PyArrow schema
     """
     arrow_fields = []
     for field in fields_metadata:
-        field_name = field.get("name", "")
+        field_name = field.get("name", "")
+        if column_formatter:
+            field_name = column_formatter(field_name)
         sf_type = field.get("type", "string")
-        arrow_type = salesforce_to_arrow_type(sf_type)
+        arrow_type = salesforce_to_arrow_type(sf_type, convert_datetime_to_timestamp)
         # All fields are nullable since Salesforce can return empty values
         arrow_fields.append(pa.field(field_name, arrow_type, nullable=True))
 

@@ -70,6 +89,8 @@ class ParquetWriter:
         schema: Optional[pa.Schema] = None,
         batch_size: int = 10000,
         convert_empty_to_null: bool = True,
+        column_formatter: Optional[Callable[[str], str]] = None,
+        convert_datetime_to_timestamp: bool = True,
     ):
         """
         Initialize ParquetWriter.

@@ -78,11 +99,15 @@ class ParquetWriter:
         :param schema: Optional PyArrow schema. If None, will be inferred from first batch
         :param batch_size: Number of records to process in each batch
         :param convert_empty_to_null: Convert empty strings to null values
+        :param column_formatter: Optional function to format column names. If None, no formatting is applied
+        :param convert_datetime_to_timestamp: If True, datetime fields are converted to timestamps, otherwise stored as strings
         """
         self.file_path = file_path
         self.schema = schema
         self.batch_size = batch_size
         self.convert_empty_to_null = convert_empty_to_null
+        self.column_formatter = column_formatter
+        self.convert_datetime_to_timestamp = convert_datetime_to_timestamp
         self._writer = None
         self._schema_finalized = False
 

@@ -106,10 +131,15 @@ class ParquetWriter:
         if not batch:
             return
 
-        #
+        # Apply column formatting if specified
         converted_batch = []
         for record in batch:
-
+            if self.column_formatter:
+                converted_record = {
+                    self.column_formatter(k): v for k, v in record.items()
+                }
+            else:
+                converted_record = record.copy()
             converted_batch.append(converted_record)
 
         # Create DataFrame

@@ -121,7 +151,7 @@ class ParquetWriter:
                 self.schema = self._infer_schema_from_dataframe(df)
             else:
                 # Filter schema to only include fields that are actually in the data
-                self.schema = self._filter_schema_to_data(self.schema, df.columns)
+                self.schema = self._filter_schema_to_data(self.schema, list(df.columns))
             self._schema_finalized = True
 
         # Apply data type conversions based on schema

@@ -181,6 +211,8 @@ class ParquetWriter:
 
     def _convert_dataframe_types(self, df: pd.DataFrame) -> None:
         """Convert DataFrame types based on the schema."""
+        if self.schema is None:
+            return
         for field in self.schema:
             field_name = field.name
             if field_name not in df.columns:

@@ -192,23 +224,72 @@ class ParquetWriter:
 
             # Apply type-specific conversions
             if pa.types.is_boolean(field.type):
-                # Convert string 'true'/'false' to boolean
-                df[field_name]
-
-
-
-
+                # Convert string 'true'/'false' to boolean, keeping original values for others
+                original_series = df[field_name]
+                mapped_series = original_series.map(
+                    {"true": True, "false": False, None: None}
+                )
+                # For values that weren't mapped, keep the original values
+                # This avoids the fillna FutureWarning by using boolean indexing instead
+                mask = mapped_series.notna()
+                result_series = original_series.copy()
+                result_series.loc[mask] = mapped_series.loc[mask]
+                df[field_name] = result_series
             elif pa.types.is_integer(field.type):
                 df[field_name] = pd.to_numeric(df[field_name], errors="coerce").astype(
                     "Int64"
                 )  # Nullable integer
             elif pa.types.is_floating(field.type):
                 df[field_name] = pd.to_numeric(df[field_name], errors="coerce")
+            elif pa.types.is_timestamp(field.type):
+                # Convert Salesforce ISO datetime strings to timestamps
+                datetime_series = df[field_name]
+                if isinstance(datetime_series, pd.Series):
+                    df[field_name] = self._convert_datetime_strings_to_timestamps(
+                        datetime_series
+                    )
 
             # Replace empty strings with None for non-string fields
             if not pa.types.is_string(field.type):
                 df[field_name] = df[field_name].replace("", pd.NA)
 
+    def _convert_datetime_strings_to_timestamps(self, series: pd.Series) -> pd.Series:
+        """
+        Convert Salesforce ISO datetime strings to pandas datetime objects.
+
+        Salesforce returns datetime in ISO format like '2023-12-25T10:30:00.000+0000'
+        or '2023-12-25T10:30:00Z'. This method handles various ISO formats.
+        """
+
+        def parse_sf_datetime(dt_str):
+            if pd.isna(dt_str) or dt_str == "" or dt_str is None:
+                return pd.NaT
+
+            try:
+                # Handle common Salesforce datetime formats
+                dt_str = str(dt_str).strip()
+
+                # Convert +0000 to Z for pandas compatibility
+                if dt_str.endswith("+0000"):
+                    dt_str = dt_str[:-5] + "Z"
+                elif dt_str.endswith("+00:00"):
+                    dt_str = dt_str[:-6] + "Z"
+
+                # Use pandas to_datetime with UTC parsing
+                return pd.to_datetime(dt_str, utc=True)
+
+            except (ValueError, TypeError) as e:
+                logging.warning(f"Failed to parse datetime string '{dt_str}': {e}")
+                return pd.NaT
+
+        # Apply the conversion function to the series
+        result = series.apply(parse_sf_datetime)
+        if isinstance(result, pd.Series):
+            return result
+        else:
+            # This shouldn't happen, but handle it gracefully
+            return pd.Series(result, index=series.index)
+
     def close(self) -> None:
         """Close the parquet writer."""
         if self._writer:
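
Note: two things are going on in this hunk. The boolean conversion now maps 'true'/'false' and copies mapped values back through a mask, sidestepping the pandas fillna FutureWarning about downcasting object columns; and timestamp columns are parsed via the new _convert_datetime_strings_to_timestamps helper, which normalizes Salesforce's '+0000' suffix before calling pd.to_datetime. A standalone sketch of both patterns:

import pandas as pd

# Map-then-mask boolean conversion: unmapped values keep their originals.
original = pd.Series(["true", "false", "maybe", None])
mapped = original.map({"true": True, "false": False})
mask = mapped.notna()
result = original.copy()
result.loc[mask] = mapped.loc[mask]
print(result.tolist())  # [True, False, 'maybe', None]

# Datetime normalization: rewrite Salesforce's '+0000' suffix to 'Z'.
dt_str = "2023-12-25T10:30:00.000+0000"
print(pd.to_datetime(dt_str[:-5] + "Z", utc=True))  # 2023-12-25 10:30:00+00:00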

@@ -223,6 +304,8 @@ async def write_query_to_parquet(
     schema: Optional[pa.Schema] = None,
     batch_size: int = 10000,
     convert_empty_to_null: bool = True,
+    column_formatter: Optional[Callable[[str], str]] = None,
+    convert_datetime_to_timestamp: bool = True,
 ) -> None:
     """
     Convenience function to write a QueryResult to a parquet file (async version).

@@ -233,18 +316,24 @@ async def write_query_to_parquet(
     :param schema: Optional pre-created PyArrow schema (takes precedence over fields_metadata)
     :param batch_size: Number of records to process in each batch
     :param convert_empty_to_null: Convert empty strings to null values
+    :param column_formatter: Optional function to format column names
+    :param convert_datetime_to_timestamp: If True, datetime fields are converted to timestamps, otherwise stored as strings
     """
     effective_schema = None
     if schema:
         effective_schema = schema
     elif fields_metadata:
-        effective_schema = create_schema_from_metadata(fields_metadata)
+        effective_schema = create_schema_from_metadata(
+            fields_metadata, column_formatter, convert_datetime_to_timestamp
+        )
 
     writer = ParquetWriter(
         file_path=file_path,
         schema=effective_schema,
         batch_size=batch_size,
         convert_empty_to_null=convert_empty_to_null,
+        column_formatter=column_formatter,
+        convert_datetime_to_timestamp=convert_datetime_to_timestamp,
     )
 
     await writer.write_query_result(query_result)
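
Note: taken together, the new keyword arguments thread from write_query_to_parquet through create_schema_from_metadata into ParquetWriter. A hedged usage sketch (parameter names are taken from the hunks above; the positional order of query_result and how the QueryResult is produced are assumptions):

import asyncio
from aio_sf.exporter.parquet_writer import write_query_to_parquet

async def export(query_result, fields_metadata):
    await write_query_to_parquet(
        query_result,
        file_path="account.parquet",
        fields_metadata=fields_metadata,
        column_formatter=str.lower,           # e.g. lowercase column names
        convert_datetime_to_timestamp=True,   # datetimes become UTC timestamps
    )

# asyncio.run(export(query_result, fields_metadata))  # given an existing QueryResult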

All other files listed above are unchanged between 0.1.0b3 and 0.1.0b5.