aio-sf 0.1.0b3__tar.gz → 0.1.0b5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/PKG-INFO +2 -2
  2. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/pyproject.toml +1 -1
  3. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/bulk_export.py +41 -42
  4. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/parquet_writer.py +107 -18
  5. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.cursor/rules/api-structure.mdc +0 -0
  6. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.cursor/rules/async-patterns.mdc +0 -0
  7. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.cursor/rules/project-tooling.mdc +0 -0
  8. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.github/workflows/publish.yml +0 -0
  9. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.github/workflows/test.yml +0 -0
  10. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/.gitignore +0 -0
  11. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/LICENSE +0 -0
  12. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/README.md +0 -0
  13. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/RELEASE.md +0 -0
  14. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/pytest.ini +0 -0
  15. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/__init__.py +0 -0
  16. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/__init__.py +0 -0
  17. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/__init__.py +0 -0
  18. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/base.py +0 -0
  19. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/client_credentials.py +0 -0
  20. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/refresh_token.py +0 -0
  21. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/sfdx_cli.py +0 -0
  22. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/auth/static_token.py +0 -0
  23. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/bulk_v2/__init__.py +0 -0
  24. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/bulk_v2/client.py +0 -0
  25. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/bulk_v2/types.py +0 -0
  26. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/client.py +0 -0
  27. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/collections/__init__.py +0 -0
  28. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/collections/client.py +0 -0
  29. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/collections/types.py +0 -0
  30. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/describe/__init__.py +0 -0
  31. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/describe/client.py +0 -0
  32. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/describe/types.py +0 -0
  33. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/query/__init__.py +0 -0
  34. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/query/client.py +0 -0
  35. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/query/types.py +0 -0
  36. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/api/types.py +0 -0
  37. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/__init__.py +0 -0
  38. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/__init__.py +0 -0
  39. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/conftest.py +0 -0
  40. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/test_api_clients.py +0 -0
  41. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/test_auth.py +0 -0
  42. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/tests/test_client.py +0 -0
  43. {aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/uv.lock +0 -0
{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: aio-sf
-Version: 0.1.0b3
-Summary: Async Salesforce library for Python with Bulk API 2.0 support
+Version: 0.1.0b5
+Summary: Async Salesforce library for Python
 Project-URL: Homepage, https://github.com/callawaycloud/aio-salesforce
 Project-URL: Repository, https://github.com/callawaycloud/aio-salesforce
 Project-URL: Issues, https://github.com/callawaycloud/aio-salesforce/issues
{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "aio-sf"
 dynamic = ["version"]
-description = "Async Salesforce library for Python with Bulk API 2.0 support"
+description = "Async Salesforce library for Python"
 readme = "README.md"
 license = {file = "LICENSE"}
 authors = [
{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/bulk_export.py
@@ -2,6 +2,7 @@ import logging
 from typing import Any, Dict, List, Generator, Optional
 import csv
 import asyncio
+import io
 
 from ..api.describe.types import FieldInfo
 from ..api.client import SalesforceClient
@@ -111,37 +112,50 @@ class QueryResult:
         """
         Stream CSV response and convert to record dictionaries.
 
+        Uses proper CSV parsing to handle quotes, newlines, and special characters correctly.
+
         :param response_text: CSV response text
        :yields: Individual record dictionaries
         """
-        lines = response_text.splitlines()
-
-        # Get the header row first
-        if not lines:
+        if not response_text or not response_text.strip():
             # No data in this batch
             return
 
         try:
-            header_line = lines[0]
-            fieldnames = next(csv.reader([header_line]))
-        except (IndexError, StopIteration, csv.Error):
-            # No data in this batch
-            return
+            # Create a StringIO object for proper CSV parsing
+            csv_buffer = io.StringIO(response_text)
+
+            # Use DictReader for proper CSV parsing with header detection
+            # This handles quotes, newlines in fields, and escaping correctly
+            csv_reader = csv.DictReader(
+                csv_buffer,
+                delimiter=",",
+                quotechar='"',
+                quoting=csv.QUOTE_MINIMAL,
+                skipinitialspace=True,
+            )
 
-        # Process each data row
-        for line in lines[1:]:
-            if line.strip():  # Skip empty lines
+            for row_num, record in enumerate(csv_reader, start=1):
                 try:
-                    # Parse the CSV row
-                    row_values = next(csv.reader([line]))
-                    # Convert to dictionary
-                    row = dict(zip(fieldnames, row_values))
-                    yield row
-                except (csv.Error, StopIteration):
-                    logging.warning(f"Error parsing line: {line}")
-                    # Skip malformed lines
+                    # Convert None values to empty strings for consistency
+                    cleaned_record = {
+                        key: (value if value is not None else "")
+                        for key, value in record.items()
+                    }
+                    yield cleaned_record
+                except Exception as e:
+                    logging.warning(f"Error processing CSV record {row_num}: {e}")
+                    # Continue processing other records
                     continue
 
+        except csv.Error as e:
+            logging.error(f"CSV parsing error: {e}")
+            # If CSV parsing fails completely, don't yield any records
+            return
+        except Exception as e:
+            logging.error(f"Unexpected error parsing CSV response: {e}")
+            return
+
     async def _generate_records(self):
         """Async generator that yields individual records."""
         locator = self._query_locator
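A minimal standalone sketch (not part of the package) of why this hunk replaces line-by-line splitting with csv.DictReader: Bulk API CSV payloads can contain quoted commas and embedded newlines, which splitlines() cuts apart while DictReader parses correctly. The payload below is made up for illustration.

```python
import csv
import io

# A CSV payload of the shape Bulk API 2.0 returns: the Description field
# contains a quoted comma and an embedded newline.
payload = 'Id,Name,Description\r\n"001xx","Acme, Inc.","line one\nline two"\r\n'

# Old behaviour: splitlines() breaks the quoted field into two pieces.
print(payload.splitlines())   # 3 fragments instead of header + 1 row

# New behaviour: DictReader handles quoting and embedded newlines.
for row in csv.DictReader(io.StringIO(payload)):
    print(row)
    # {'Id': '001xx', 'Name': 'Acme, Inc.', 'Description': 'line one\nline two'}
```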
@@ -170,7 +184,9 @@ class QueryResult:
 
             except Exception as e:
                 raise Exception(
-                    f"Error processing record {ctn}: {e}. Current Query Locator: {locator}"
+                    f"Error processing record {ctn}: {e}. Current Query Locator: {locator}. "
+                    f"This may indicate a CSV parsing issue - check if the response contains "
+                    f"malformed CSV data or fields with special characters."
                 )
 
 
@@ -296,32 +312,15 @@ def resume_from_locator(
 
 
 # Helper function to get all fields that can be queried by bulk API
-async def get_bulk_fields(
-    sf: SalesforceClient, object_type: str, api_version: Optional[str] = None
-) -> List[FieldInfo]:
-    """Get field metadata for queryable fields in a Salesforce object.
-
-    :param sf: Salesforce client instance
-    :param object_type: Name of the Salesforce object (e.g., 'Account', 'Contact')
-    :param api_version: API version to use (defaults to client version)
-    :returns: List of field metadata dictionaries for queryable fields
-    """
+async def get_bulk_fields(fields_metadata: List[FieldInfo]) -> List[FieldInfo]:
+    """Get field metadata for queryable fields in a Salesforce object."""
     # Use the metadata API to get object description
 
-    # Filter to only queryable fields that aren't compound fields
+    # Filter to only queryable fields that aren't compound fields (unless field is actually name)
     queryable_fields = [
         field
         for field in fields_metadata
-        if field.get("name") not in compound_field_names
+        if field.get("type") not in ["address", "location"]
     ]
 
     return queryable_fields
-    describe_data = await sf.describe.sobject(object_type, api_version)
-    fields_metadata = describe_data["fields"]
-
-    # Create a set of all compound field names to exclude
-    compound_field_names = {
-        field.get("compoundFieldName")
-        for field in fields_metadata
-        if field.get("compoundFieldName")
-    }
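Because the b5 signature no longer takes a client, callers now fetch the describe metadata themselves and pass the field list in. A hedged sketch of the new call pattern, inside an async function (`sf.describe.sobject` appears in the removed code; the rest is illustrative):

```python
# Sketch, assuming `sf` is a SalesforceClient from this package.
describe_data = await sf.describe.sobject("Account")
fields = await get_bulk_fields(describe_data["fields"])

# Build a SOQL field list from the filtered metadata.
field_names = [f["name"] for f in fields]
soql = f"SELECT {', '.join(field_names)} FROM Account"
```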
{aio_sf-0.1.0b3 → aio_sf-0.1.0b5}/src/aio_sf/exporter/parquet_writer.py
@@ -3,26 +3,37 @@ Parquet writer module for converting Salesforce QueryResult to Parquet format.
 """
 
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Callable
 from pathlib import Path
 import pyarrow as pa
 import pandas as pd
 import pyarrow.parquet as pq
+from datetime import datetime
 
 from ..api.describe.types import FieldInfo
 
 from .bulk_export import QueryResult, batch_records_async
 
 
-def salesforce_to_arrow_type(sf_type: str) -> pa.DataType:
-    """Convert Salesforce data types to Arrow data types."""
+def salesforce_to_arrow_type(
+    sf_type: str, convert_datetime_to_timestamp: bool = True
+) -> pa.DataType:
+    """Convert Salesforce data types to Arrow data types.
+
+    :param sf_type: Salesforce field type
+    :param convert_datetime_to_timestamp: If True, datetime fields use timestamp type, otherwise string
+    """
     type_mapping = {
         "string": pa.string(),
         "boolean": pa.bool_(),
         "int": pa.int64(),
         "double": pa.float64(),
-        "date": pa.string(),  # Store as string since SF returns ISO format
-        "datetime": pa.string(),  # Store as string since SF returns ISO format
+        "date": pa.string(),  # Always store as string since SF returns ISO format
+        "datetime": (
+            pa.timestamp("us", tz="UTC")
+            if convert_datetime_to_timestamp
+            else pa.string()
+        ),
         "currency": pa.float64(),
         "reference": pa.string(),
         "picklist": pa.string(),
@@ -40,18 +51,26 @@ def salesforce_to_arrow_type(sf_type: str) -> pa.DataType:
     return type_mapping.get(sf_type.lower(), pa.string())
 
 
-def create_schema_from_metadata(fields_metadata: List[FieldInfo]) -> pa.Schema:
+def create_schema_from_metadata(
+    fields_metadata: List[FieldInfo],
+    column_formatter: Optional[Callable[[str], str]] = None,
+    convert_datetime_to_timestamp: bool = True,
+) -> pa.Schema:
     """
     Create a PyArrow schema from Salesforce field metadata.
 
     :param fields_metadata: List of field metadata dictionaries from Salesforce
+    :param column_formatter: Optional function to format column names
+    :param convert_datetime_to_timestamp: If True, datetime fields use timestamp type, otherwise string
     :returns: PyArrow schema
     """
     arrow_fields = []
     for field in fields_metadata:
-        field_name = field.get("name", "").lower()  # Normalize to lowercase
+        field_name = field.get("name", "")
+        if column_formatter:
+            field_name = column_formatter(field_name)
         sf_type = field.get("type", "string")
-        arrow_type = salesforce_to_arrow_type(sf_type)
+        arrow_type = salesforce_to_arrow_type(sf_type, convert_datetime_to_timestamp)
         # All fields are nullable since Salesforce can return empty values
         arrow_fields.append(pa.field(field_name, arrow_type, nullable=True))
 
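A small sketch of how the new parameters behave. The metadata dicts below are made up; only the "name" and "type" keys matter to this function, and the expected output assumes the default type mapping shown above (unknown types such as "id" fall back to string):

```python
fields_metadata = [
    {"name": "Id", "type": "id"},
    {"name": "CreatedDate", "type": "datetime"},
    {"name": "AnnualRevenue", "type": "currency"},
]

# b5 no longer lowercases names implicitly; pass str.lower to keep the b3 behaviour.
schema = create_schema_from_metadata(
    fields_metadata,
    column_formatter=str.lower,
    convert_datetime_to_timestamp=True,
)
print(schema)
# id: string
# createddate: timestamp[us, tz=UTC]
# annualrevenue: double
```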
@@ -70,6 +89,8 @@ class ParquetWriter:
         schema: Optional[pa.Schema] = None,
         batch_size: int = 10000,
         convert_empty_to_null: bool = True,
+        column_formatter: Optional[Callable[[str], str]] = None,
+        convert_datetime_to_timestamp: bool = True,
     ):
         """
         Initialize ParquetWriter.
@@ -78,11 +99,15 @@ class ParquetWriter:
         :param schema: Optional PyArrow schema. If None, will be inferred from first batch
         :param batch_size: Number of records to process in each batch
         :param convert_empty_to_null: Convert empty strings to null values
+        :param column_formatter: Optional function to format column names. If None, no formatting is applied
+        :param convert_datetime_to_timestamp: If True, datetime fields are converted to timestamps, otherwise stored as strings
         """
         self.file_path = file_path
         self.schema = schema
         self.batch_size = batch_size
         self.convert_empty_to_null = convert_empty_to_null
+        self.column_formatter = column_formatter
+        self.convert_datetime_to_timestamp = convert_datetime_to_timestamp
         self._writer = None
         self._schema_finalized = False
 
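The new column_formatter accepts any str -> str callable, not just str.lower. A hedged sketch of a custom formatter (the snake_case helper and the constructor arguments here are illustrative, not part of the package):

```python
import re

def snake_case(name: str) -> str:
    """Turn Salesforce-style names like 'BillingStreet' or 'Custom_Field__c'
    into 'billing_street' / 'custom_field__c'."""
    return re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", name).lower()

writer = ParquetWriter(
    file_path="contacts.parquet",
    batch_size=5_000,
    column_formatter=snake_case,          # applied to every column name
    convert_datetime_to_timestamp=False,  # keep datetimes as ISO strings
)
```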
@@ -106,10 +131,15 @@ class ParquetWriter:
         if not batch:
             return
 
-        # Convert field names to lowercase for consistency
+        # Apply column formatting if specified
         converted_batch = []
         for record in batch:
-            converted_record = {k.lower(): v for k, v in record.items()}
+            if self.column_formatter:
+                converted_record = {
+                    self.column_formatter(k): v for k, v in record.items()
+                }
+            else:
+                converted_record = record.copy()
             converted_batch.append(converted_record)
 
         # Create DataFrame
@@ -121,7 +151,7 @@ class ParquetWriter:
                 self.schema = self._infer_schema_from_dataframe(df)
             else:
                 # Filter schema to only include fields that are actually in the data
-                self.schema = self._filter_schema_to_data(self.schema, df.columns)
+                self.schema = self._filter_schema_to_data(self.schema, list(df.columns))
             self._schema_finalized = True
 
         # Apply data type conversions based on schema
@@ -181,6 +211,8 @@ class ParquetWriter:
 
     def _convert_dataframe_types(self, df: pd.DataFrame) -> None:
         """Convert DataFrame types based on the schema."""
+        if self.schema is None:
+            return
         for field in self.schema:
             field_name = field.name
             if field_name not in df.columns:
@@ -192,23 +224,72 @@ class ParquetWriter:
 
             # Apply type-specific conversions
             if pa.types.is_boolean(field.type):
-                # Convert string 'true'/'false' to boolean
-                df[field_name] = (
-                    df[field_name]
-                    .map({"true": True, "false": False, None: None})
-                    .fillna(df[field_name])
-                )  # Keep original values for non-string booleans
+                # Convert string 'true'/'false' to boolean, keeping original values for others
+                original_series = df[field_name]
+                mapped_series = original_series.map(
+                    {"true": True, "false": False, None: None}
+                )
+                # For values that weren't mapped, keep the original values
+                # This avoids the fillna FutureWarning by using boolean indexing instead
+                mask = mapped_series.notna()
+                result_series = original_series.copy()
+                result_series.loc[mask] = mapped_series.loc[mask]
+                df[field_name] = result_series
             elif pa.types.is_integer(field.type):
                 df[field_name] = pd.to_numeric(df[field_name], errors="coerce").astype(
                     "Int64"
                 )  # Nullable integer
             elif pa.types.is_floating(field.type):
                 df[field_name] = pd.to_numeric(df[field_name], errors="coerce")
+            elif pa.types.is_timestamp(field.type):
+                # Convert Salesforce ISO datetime strings to timestamps
+                datetime_series = df[field_name]
+                if isinstance(datetime_series, pd.Series):
+                    df[field_name] = self._convert_datetime_strings_to_timestamps(
+                        datetime_series
+                    )
 
             # Replace empty strings with None for non-string fields
             if not pa.types.is_string(field.type):
                 df[field_name] = df[field_name].replace("", pd.NA)
 
+    def _convert_datetime_strings_to_timestamps(self, series: pd.Series) -> pd.Series:
+        """
+        Convert Salesforce ISO datetime strings to pandas datetime objects.
+
+        Salesforce returns datetime in ISO format like '2023-12-25T10:30:00.000+0000'
+        or '2023-12-25T10:30:00Z'. This method handles various ISO formats.
+        """
+
+        def parse_sf_datetime(dt_str):
+            if pd.isna(dt_str) or dt_str == "" or dt_str is None:
+                return pd.NaT
+
+            try:
+                # Handle common Salesforce datetime formats
+                dt_str = str(dt_str).strip()
+
+                # Convert +0000 to Z for pandas compatibility
+                if dt_str.endswith("+0000"):
+                    dt_str = dt_str[:-5] + "Z"
+                elif dt_str.endswith("+00:00"):
+                    dt_str = dt_str[:-6] + "Z"
+
+                # Use pandas to_datetime with UTC parsing
+                return pd.to_datetime(dt_str, utc=True)
+
+            except (ValueError, TypeError) as e:
+                logging.warning(f"Failed to parse datetime string '{dt_str}': {e}")
+                return pd.NaT
+
+        # Apply the conversion function to the series
+        result = series.apply(parse_sf_datetime)
+        if isinstance(result, pd.Series):
+            return result
+        else:
+            # This shouldn't happen, but handle it gracefully
+            return pd.Series(result, index=series.index)
+
     def close(self) -> None:
         """Close the parquet writer."""
         if self._writer:
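A standalone sketch (values made up) of the datetime normalization this hunk introduces: the two offset spellings Salesforce emits are rewritten to a trailing "Z" before handing the string to pandas, and empty values become NaT.

```python
import pandas as pd

raw = pd.Series(["2023-12-25T10:30:00.000+0000", "2023-12-25T10:30:00Z", ""])

def normalise(dt_str):
    if not dt_str:
        return pd.NaT
    if dt_str.endswith("+0000"):
        dt_str = dt_str[:-5] + "Z"
    elif dt_str.endswith("+00:00"):
        dt_str = dt_str[:-6] + "Z"
    return pd.to_datetime(dt_str, utc=True)

print(raw.apply(normalise))
# 0    2023-12-25 10:30:00+00:00
# 1    2023-12-25 10:30:00+00:00
# 2                          NaT
```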
@@ -223,6 +304,8 @@ async def write_query_to_parquet(
     schema: Optional[pa.Schema] = None,
     batch_size: int = 10000,
     convert_empty_to_null: bool = True,
+    column_formatter: Optional[Callable[[str], str]] = None,
+    convert_datetime_to_timestamp: bool = True,
 ) -> None:
     """
     Convenience function to write a QueryResult to a parquet file (async version).
@@ -233,18 +316,24 @@ async def write_query_to_parquet(
     :param schema: Optional pre-created PyArrow schema (takes precedence over fields_metadata)
     :param batch_size: Number of records to process in each batch
     :param convert_empty_to_null: Convert empty strings to null values
+    :param column_formatter: Optional function to format column names
+    :param convert_datetime_to_timestamp: If True, datetime fields are converted to timestamps, otherwise stored as strings
     """
     effective_schema = None
     if schema:
         effective_schema = schema
     elif fields_metadata:
-        effective_schema = create_schema_from_metadata(fields_metadata)
+        effective_schema = create_schema_from_metadata(
+            fields_metadata, column_formatter, convert_datetime_to_timestamp
+        )
 
     writer = ParquetWriter(
         file_path=file_path,
         schema=effective_schema,
         batch_size=batch_size,
         convert_empty_to_null=convert_empty_to_null,
+        column_formatter=column_formatter,
+        convert_datetime_to_timestamp=convert_datetime_to_timestamp,
     )
 
     await writer.write_query_result(query_result)
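A hedged end-to-end sketch of the convenience function with the new b5 options, inside an async function. `query_result` and `fields` are assumed to come from the exporter and describe helpers shown earlier in this diff; the keyword names match the docstring, but the call as a whole is illustrative only.

```python
# Sketch: export a query result to Parquet with the new options.
await write_query_to_parquet(
    query_result=query_result,
    file_path="account_export.parquet",
    fields_metadata=fields,
    batch_size=10_000,
    column_formatter=str.lower,            # opt back in to lowercase column names
    convert_datetime_to_timestamp=True,    # datetime fields become timestamp[us, tz=UTC]
)
```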