atlan-application-sdk 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/secretstore.py +1 -1
  25. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  26. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  27. application_sdk/version.py +1 -1
  28. application_sdk/worker.py +1 -1
  29. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +35 -42
  31. application_sdk/common/dataframe_utils.py +0 -42
  32. application_sdk/events/__init__.py +0 -5
  33. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  34. application_sdk/inputs/__init__.py +0 -168
  35. application_sdk/inputs/iceberg.py +0 -75
  36. application_sdk/inputs/json.py +0 -136
  37. application_sdk/inputs/parquet.py +0 -272
  38. application_sdk/inputs/sql_query.py +0 -271
  39. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  40. application_sdk/outputs/__init__.py +0 -453
  41. application_sdk/outputs/iceberg.py +0 -139
  42. application_sdk/outputs/json.py +0 -268
  43. /application_sdk/{events → interceptors}/models.py +0 -0
  44. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  45. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
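The headline change in 2.0.0 is the consolidation of the former application_sdk/inputs/* and application_sdk/outputs/* modules into a single application_sdk/io package built around Reader and Writer base classes, shown in the new application_sdk/io/__init__.py below. The sketch that follows is illustrative only: it mirrors the usage in the new module's docstrings, and the JsonFileWriter import path and constructor arguments are assumptions drawn from those docstrings rather than a verified public API.

```python
# Illustrative sketch based on the docstrings in application_sdk/io/__init__.py.
# JsonFileWriter is assumed to be exported from application_sdk/io/json.py; its
# constructor arguments are taken from the docstring examples and may differ.
import asyncio

import pandas as pd

from application_sdk.io.json import JsonFileWriter  # assumed import path


async def main() -> None:
    df = pd.DataFrame([{"schema": "sales", "table": "orders"}])

    # Writers follow open/write/close semantics; the async context manager
    # calls close() automatically when the block exits.
    async with JsonFileWriter(path="/data/output") as writer:
        await writer.write(df)                    # DataFrame input
        await writer.write({"schema": "sales"})   # dicts are converted to rows

    # Or manage the lifecycle explicitly to inspect the returned statistics.
    writer = JsonFileWriter(path="/data/output")
    await writer.write(df)
    stats = await writer.close()
    print(f"Wrote {stats.total_record_count} records in {stats.chunk_count} chunks")


if __name__ == "__main__":
    asyncio.run(main())
```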
@@ -0,0 +1,654 @@
+ """I/O module for handling data input and output operations.
+
+ This module provides base classes and utilities for reading and writing data in the
+ application, including file-based outputs and object store interactions.
+ """
+
+ import gc
+ import inspect
+ import os
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     AsyncGenerator,
+     AsyncIterator,
+     Dict,
+     Generator,
+     Iterator,
+     List,
+     Optional,
+     Union,
+     cast,
+ )
+
+ import orjson
+ from temporalio import activity
+
+ from application_sdk.activities.common.models import ActivityStatistics
+ from application_sdk.activities.common.utils import get_object_store_prefix
+ from application_sdk.common.types import DataframeType
+ from application_sdk.constants import ENABLE_ATLAN_UPLOAD, UPSTREAM_OBJECT_STORE_NAME
+ from application_sdk.io.utils import (
+     estimate_dataframe_record_size,
+     is_empty_dataframe,
+     path_gen,
+ )
+ from application_sdk.observability.logger_adaptor import get_logger
+ from application_sdk.observability.metrics_adaptor import MetricType
+ from application_sdk.services.objectstore import ObjectStore
+
+ logger = get_logger(__name__)
+ activity.logger = logger
+
+
+ if TYPE_CHECKING:
+     import daft  # type: ignore
+     import pandas as pd
+
+
+ class Reader(ABC):
+     """
+     Abstract base class for reader data sources.
+     """
+
+     @abstractmethod
+     def read_batches(
+         self,
+     ) -> Union[
+         Iterator["pd.DataFrame"],
+         AsyncIterator["pd.DataFrame"],
+         Iterator["daft.DataFrame"],
+         AsyncIterator["daft.DataFrame"],
+     ]:
+         """
+         Get an iterator of batched pandas or daft DataFrames.
+
+         Returns:
+             An iterator (sync or async) of batched pandas or daft DataFrames.
+
+         Raises:
+             NotImplementedError: If the method is not implemented.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+         """
+         Get a single pandas or daft DataFrame.
+
+         Returns:
+             Union["pd.DataFrame", "daft.DataFrame"]: A pandas or daft DataFrame.
+
+         Raises:
+             NotImplementedError: If the method is not implemented.
+         """
+         raise NotImplementedError
+
+
+ class WriteMode(Enum):
+     """Enumeration of write modes for output operations."""
+
+     APPEND = "append"
+     OVERWRITE = "overwrite"
+     OVERWRITE_PARTITIONS = "overwrite-partitions"
+
+
+ class Writer(ABC):
+     """Abstract base class for writer handlers.
+
+     This class defines the interface for writer handlers that can write data
+     to various destinations in different formats. Follows Python's file I/O
+     pattern with open/write/close semantics and supports context managers.
+
+     Attributes:
+         path (str): Path where the output files will be written.
+         output_prefix (str): Prefix for files when uploading to object store.
+         total_record_count (int): Total number of records processed.
+         chunk_count (int): Number of chunks the output was split into.
+         buffer_size (int): Number of records buffered before flushing to a file.
+         max_file_size_bytes (int): Maximum size in bytes of a single output file.
+         current_buffer_size (int): Current number of records held in the buffer.
+         current_buffer_size_bytes (int): Current size of the buffer in bytes.
+         partitions (List[int]): Partition indices of the chunks written so far.
+
+     Example:
+         Using close() explicitly::
+
+             writer = JsonFileWriter(path="/data/output")
+             await writer.write(dataframe)
+             await writer.write({"key": "value"})  # Dict support
+             stats = await writer.close()
+
+         Using context manager (recommended)::
+
+             async with JsonFileWriter(path="/data/output") as writer:
+                 await writer.write(dataframe)
+             # close() called automatically
+     """
+
+     path: str
+     output_prefix: str
+     total_record_count: int
+     chunk_count: int
+     chunk_part: int
+     buffer_size: int
+     max_file_size_bytes: int
+     current_buffer_size: int
+     current_buffer_size_bytes: int
+     partitions: List[int]
+     extension: str
+     dataframe_type: DataframeType
+     _is_closed: bool = False
+     _statistics: Optional[ActivityStatistics] = None
+
+     async def __aenter__(self) -> "Writer":
+         """Enter the async context manager.
+
+         Returns:
+             Writer: The writer instance.
+         """
+         return self
+
+     async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         """Exit the async context manager, closing the writer.
+
+         Args:
+             exc_type: Exception type if an exception was raised.
+             exc_val: Exception value if an exception was raised.
+             exc_tb: Exception traceback if an exception was raised.
+         """
+         await self.close()
+
+     def _convert_to_dataframe(
+         self,
+         data: Union[
+             "pd.DataFrame", "daft.DataFrame", Dict[str, Any], List[Dict[str, Any]]
+         ],
+     ) -> Union["pd.DataFrame", "daft.DataFrame"]:
+         """Convert input data to a DataFrame if needed.
+
+         Args:
+             data: Input data - can be a DataFrame, dict, or list of dicts.
+
+         Returns:
+             A pandas or daft DataFrame depending on self.dataframe_type.
+
+         Raises:
+             TypeError: If data type is not supported or if dict/list input is used with daft when daft is not available.
+         """
+         import pandas as pd
+
+         # Already a pandas DataFrame - return as-is or convert to daft if needed
+         if isinstance(data, pd.DataFrame):
+             if self.dataframe_type == DataframeType.daft:
+                 try:
+                     import daft
+
+                     return daft.from_pandas(data)
+                 except ImportError:
+                     raise TypeError(
+                         "daft is not installed. Please install daft to use DataframeType.daft, "
+                         "or use DataframeType.pandas instead."
+                     )
+             return data
+
+         # Check for daft DataFrame
+         try:
+             import daft
+
+             if isinstance(data, daft.DataFrame):
+                 return data
+         except ImportError:
+             pass
+
+         # Convert dict or list of dicts to DataFrame
+         if isinstance(data, dict) or (
+             isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict)
+         ):
+             # For daft dataframe_type, convert to daft DataFrame directly
+             if self.dataframe_type == DataframeType.daft:
+                 try:
+                     import daft
+
+                     # Convert to columnar format for daft.from_pydict()
+                     if isinstance(data, dict):
+                         # Single dict: {"col1": "val1", "col2": "val2"} -> {"col1": ["val1"], "col2": ["val2"]}
+                         columnar_data = {k: [v] for k, v in data.items()}
+                     else:
+                         # List of dicts: [{"col1": "v1"}, {"col1": "v2"}] -> {"col1": ["v1", "v2"]}
+                         columnar_data = {}
+                         for record in data:
+                             for key, value in record.items():
+                                 if key not in columnar_data:
+                                     columnar_data[key] = []
+                                 columnar_data[key].append(value)
+                     return daft.from_pydict(columnar_data)
+                 except ImportError:
+                     raise TypeError(
+                         "Dict and list inputs require daft to be installed when using DataframeType.daft. "
+                         "Please install daft or use DataframeType.pandas instead."
+                     )
+             # For pandas dataframe_type, convert to pandas DataFrame
+             return pd.DataFrame([data] if isinstance(data, dict) else data)
+
+         raise TypeError(
+             f"Unsupported data type: {type(data).__name__}. "
+             "Expected DataFrame, dict, or list of dicts."
+         )
+
+     async def write(
+         self,
+         data: Union[
+             "pd.DataFrame", "daft.DataFrame", Dict[str, Any], List[Dict[str, Any]]
+         ],
+         **kwargs: Any,
+     ) -> None:
+         """Write data to the output destination.
+
+         Supports writing DataFrames, dicts (converted to single-row DataFrame),
+         or lists of dicts (converted to multi-row DataFrame).
+
+         Args:
+             data: Data to write - DataFrame, dict, or list of dicts.
+             **kwargs: Additional parameters passed to the underlying write method.
+
+         Raises:
+             ValueError: If the writer has been closed or dataframe_type is unsupported.
+             TypeError: If data type is not supported.
+         """
+         if self._is_closed:
+             raise ValueError("Cannot write to a closed writer")
+
+         # Convert to DataFrame if needed
+         dataframe = self._convert_to_dataframe(data)
+
+         if self.dataframe_type == DataframeType.pandas:
+             await self._write_dataframe(dataframe, **kwargs)
+         elif self.dataframe_type == DataframeType.daft:
+             await self._write_daft_dataframe(dataframe, **kwargs)
+         else:
+             raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+     async def write_batches(
+         self,
+         dataframe: Union[
+             AsyncGenerator["pd.DataFrame", None],
+             Generator["pd.DataFrame", None, None],
+             AsyncGenerator["daft.DataFrame", None],
+             Generator["daft.DataFrame", None, None],
+         ],
+     ) -> None:
+         """Write batched DataFrames to the output destination.
+
+         Args:
+             dataframe: Async or sync generator yielding DataFrames.
+
+         Raises:
+             ValueError: If the writer has been closed or dataframe_type is unsupported.
+         """
+         if self._is_closed:
+             raise ValueError("Cannot write to a closed writer")
+
+         if self.dataframe_type == DataframeType.pandas:
+             await self._write_batched_dataframe(dataframe)
+         elif self.dataframe_type == DataframeType.daft:
+             await self._write_batched_daft_dataframe(dataframe)
+         else:
+             raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+     async def _write_batched_dataframe(
+         self,
+         batched_dataframe: Union[
+             AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
+         ],
+     ):
+         """Write batched pandas DataFrames to the output destination.
+
+         This method writes each DataFrame yielded by the generator, potentially
+         splitting it into chunks based on the buffer_size and max_file_size_bytes settings.
+
+         Args:
+             batched_dataframe: Async or sync generator yielding pandas DataFrames.
+
+         Note:
+             Empty DataFrames are skipped without writing.
+         """
+         try:
+             if inspect.isasyncgen(batched_dataframe):
+                 async for dataframe in batched_dataframe:
+                     if not is_empty_dataframe(dataframe):
+                         await self._write_dataframe(dataframe)
+             else:
+                 # Cast to Generator since we've confirmed it's not an AsyncGenerator
+                 sync_generator = cast(
+                     Generator["pd.DataFrame", None, None], batched_dataframe
+                 )
+                 for dataframe in sync_generator:
+                     if not is_empty_dataframe(dataframe):
+                         await self._write_dataframe(dataframe)
+         except Exception as e:
+             logger.error(f"Error writing batched dataframe: {str(e)}")
+             raise
+
+     async def _write_dataframe(self, dataframe: "pd.DataFrame", **kwargs):
+         """Write a pandas DataFrame to output files and upload them to the object store.
+
+         Args:
+             dataframe (pd.DataFrame): The DataFrame to write.
+             **kwargs: Additional parameters (currently unused for pandas DataFrames).
+         """
+         try:
+             if self.chunk_start is None:
+                 self.chunk_part = 0
+             if len(dataframe) == 0:
+                 return
+
+             chunk_size_bytes = estimate_dataframe_record_size(dataframe, self.extension)
+
+             for i in range(0, len(dataframe), self.buffer_size):
+                 chunk = dataframe[i : i + self.buffer_size]
+
+                 if (
+                     self.current_buffer_size_bytes + chunk_size_bytes
+                     > self.max_file_size_bytes
+                 ):
+                     output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, extension=self.extension)}"
+                     if os.path.exists(output_file_name):
+                         await self._upload_file(output_file_name)
+                         self.chunk_part += 1
+
+                 self.current_buffer_size += len(chunk)
+                 self.current_buffer_size_bytes += chunk_size_bytes * len(chunk)
+                 await self._flush_buffer(chunk, self.chunk_part)
+
+                 del chunk
+                 gc.collect()
+
+             if self.current_buffer_size_bytes > 0:
+                 # Finally upload the final file to the object store
+                 output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, extension=self.extension)}"
+                 if os.path.exists(output_file_name):
+                     await self._upload_file(output_file_name)
+                     self.chunk_part += 1
+
+             # Record metrics for successful write
+             self.metrics.record_metric(
+                 name="write_records",
+                 value=len(dataframe),
+                 metric_type=MetricType.COUNTER,
+                 labels={"type": "pandas", "mode": WriteMode.APPEND.value},
+                 description="Number of records written to files from pandas DataFrame",
+             )
+
+             # Record chunk metrics
+             self.metrics.record_metric(
+                 name="chunks_written",
+                 value=1,
+                 metric_type=MetricType.COUNTER,
+                 labels={"type": "pandas", "mode": WriteMode.APPEND.value},
+                 description="Number of chunks written to files",
+             )
+
+             # If chunk_start is set we don't want to increment the chunk_count
+             # Since it should only increment the chunk_part in this case
+             if self.chunk_start is None:
+                 self.chunk_count += 1
+             self.partitions.append(self.chunk_part)
+         except Exception as e:
+             # Record metrics for failed write
+             self.metrics.record_metric(
+                 name="write_errors",
+                 value=1,
+                 metric_type=MetricType.COUNTER,
+                 labels={
+                     "type": "pandas",
+                     "mode": WriteMode.APPEND.value,
+                     "error": str(e),
+                 },
+                 description="Number of errors while writing to files",
+             )
+             logger.error(f"Error writing pandas dataframe to files: {str(e)}")
+             raise
+
+     async def _write_batched_daft_dataframe(
+         self,
+         batched_dataframe: Union[
+             AsyncGenerator["daft.DataFrame", None],  # noqa: F821
+             Generator["daft.DataFrame", None, None],  # noqa: F821
+         ],
+     ):
+         """Write batched daft DataFrames to the output destination.
+
+         This method writes each DataFrame yielded by the generator, potentially
+         splitting it into chunks based on the buffer_size and max_file_size_bytes settings.
+
+         Args:
+             batched_dataframe: Async or sync generator yielding daft DataFrames.
+
+         Note:
+             Empty DataFrames are skipped without writing.
+         """
+         try:
+             if inspect.isasyncgen(batched_dataframe):
+                 async for dataframe in batched_dataframe:
+                     if not is_empty_dataframe(dataframe):
+                         await self._write_daft_dataframe(dataframe)
+             else:
+                 # Cast to Generator since we've confirmed it's not an AsyncGenerator
+                 sync_generator = cast(
+                     Generator["daft.DataFrame", None, None], batched_dataframe
+                 )  # noqa: F821
+                 for dataframe in sync_generator:
+                     if not is_empty_dataframe(dataframe):
+                         await self._write_daft_dataframe(dataframe)
+         except Exception as e:
+             logger.error(f"Error writing batched daft dataframe: {str(e)}")
+             raise
+
+     @abstractmethod
+     async def _write_daft_dataframe(self, dataframe: "daft.DataFrame", **kwargs):  # noqa: F821
+         """Write a daft DataFrame to the output destination.
+
+         Args:
+             dataframe (daft.DataFrame): The DataFrame to write.
+             **kwargs: Additional parameters passed through from write().
+         """
+         pass
+
+     @property
+     def statistics(self) -> ActivityStatistics:
+         """Get current statistics without closing the writer.
+
+         Returns:
+             ActivityStatistics: Current statistics (record count, chunk count, partitions).
+
+         Note:
+             This returns the current state. For final statistics after all
+             writes complete, use close() instead.
+         """
+         return ActivityStatistics(
+             total_record_count=self.total_record_count,
+             chunk_count=len(self.partitions),
+             partitions=self.partitions,
+         )
+
+     async def _finalize(self) -> None:
+         """Finalize the writer before closing.
+
+         Override this method in subclasses to perform any final flush operations,
+         upload remaining files, etc. This is called by close() before writing statistics.
+         """
+         pass
+
+     async def close(self) -> ActivityStatistics:
+         """Close the writer, flush buffers, upload files, and return statistics.
+
+         This method finalizes all pending writes, uploads any remaining files to
+         the object store, writes statistics, and marks the writer as closed.
+         Calling close() multiple times is safe (subsequent calls are no-ops).
+
+         The typename for statistics is automatically taken from `self.typename`
+         if it was set during initialization.
+
+         Returns:
+             ActivityStatistics: Final statistics including total_record_count,
+                 chunk_count, and partitions.
+
+         Raises:
+             ValueError: If statistics data is invalid.
+             Exception: If there's an error during finalization or writing statistics.
+
+         Example:
+             ```python
+             writer = JsonFileWriter(path="/data/output", typename="table")
+             await writer.write(dataframe)
+             stats = await writer.close()
+             print(f"Wrote {stats.total_record_count} records")
+             ```
+         """
+         if self._is_closed:
+             if self._statistics:
+                 return self._statistics
+             return self.statistics
+
+         try:
+             # Allow subclasses to perform final flush/upload operations
+             await self._finalize()
+
+             # Use self.typename if available
+             typename = getattr(self, "typename", None)
+
+             # Write statistics to file and object store
+             statistics_dict = await self._write_statistics(typename)
+             if not statistics_dict:
+                 raise ValueError("No statistics data available")
+
+             self._statistics = ActivityStatistics.model_validate(statistics_dict)
+             if typename:
+                 self._statistics.typename = typename
+
+             self._is_closed = True
+             return self._statistics
+
+         except Exception as e:
+             logger.error(f"Error closing writer: {str(e)}")
+             raise
+
+     async def _upload_file(self, file_name: str):
+         """Upload a file to the object store."""
+         # Get retain_local_copy from the writer instance, defaulting to False
+         retain_local = getattr(self, "retain_local_copy", False)
+
+         if ENABLE_ATLAN_UPLOAD:
+             await ObjectStore.upload_file(
+                 source=file_name,
+                 store_name=UPSTREAM_OBJECT_STORE_NAME,
+                 retain_local_copy=True,  # Always retain for the second upload to deployment store
+                 destination=get_object_store_prefix(file_name),
+             )
+         await ObjectStore.upload_file(
+             source=file_name,
+             destination=get_object_store_prefix(file_name),
+             retain_local_copy=retain_local,  # Respect the writer's retain_local_copy setting
+         )
+
+         self.current_buffer_size_bytes = 0
+
+     async def _flush_buffer(self, chunk: "pd.DataFrame", chunk_part: int):
+         """Flush a buffered chunk of records to an output file.
+
+         This method writes the given chunk to the current output file (via the
+         subclass's _write_chunk implementation) and resets the record buffer.
+
+         Note:
+             If the chunk is empty or has no records, nothing is written.
+         """
+         try:
+             if not is_empty_dataframe(chunk):
+                 self.total_record_count += len(chunk)
+                 output_file_name = f"{self.path}/{path_gen(self.chunk_count, chunk_part, extension=self.extension)}"
+                 await self._write_chunk(chunk, output_file_name)
+
+             self.current_buffer_size = 0
+
+             # Record chunk metrics
+             self.metrics.record_metric(
+                 name="chunks_written",
+                 value=1,
+                 metric_type=MetricType.COUNTER,
+                 labels={"type": "output"},
+                 description="Number of chunks written to files",
+             )
+
+         except Exception as e:
+             # Record metrics for failed write
+             self.metrics.record_metric(
+                 name="write_errors",
+                 value=1,
+                 metric_type=MetricType.COUNTER,
+                 labels={"type": "output", "error": str(e)},
+                 description="Number of errors while writing to files",
+             )
+             logger.error(f"Error flushing buffer to files: {str(e)}")
+             raise e
+
+     async def _write_statistics(
+         self, typename: Optional[str] = None
+     ) -> Optional[Dict[str, Any]]:
+         """Write statistics about the output to a JSON file.
+
+         Internal method called by close() to persist statistics.
+
+         Args:
+             typename (str, optional): Type name for organizing statistics.
+
+         Returns:
+             Dict containing statistics data.
+
+         Raises:
+             Exception: If there's an error writing or uploading the statistics.
+         """
+         try:
+             # prepare the statistics
+             statistics = {
+                 "total_record_count": self.total_record_count,
+                 "chunk_count": len(self.partitions),
+                 "partitions": self.partitions,
+             }
+
+             # Ensure typename is included in the statistics payload (if provided)
+             if typename:
+                 statistics["typename"] = typename
+
+             # Write the statistics to a json file inside a dedicated statistics/ folder
+             statistics_dir = os.path.join(self.path, "statistics")
+             os.makedirs(statistics_dir, exist_ok=True)
+             output_file_name = os.path.join(statistics_dir, "statistics.json.ignore")
+             # If chunk_start is provided, include it in the statistics filename
+             try:
+                 cs = getattr(self, "chunk_start", None)
+                 if cs is not None:
+                     output_file_name = os.path.join(
+                         statistics_dir, f"statistics-chunk-{cs}.json.ignore"
+                     )
+             except Exception:
+                 # If accessing chunk_start fails, fallback to default filename
+                 pass
+
+             # Write the statistics dictionary to the JSON file
+             with open(output_file_name, "wb") as f:
+                 f.write(orjson.dumps(statistics))
+
+             destination_file_path = get_object_store_prefix(output_file_name)
+             # Push the file to the object store
+             await ObjectStore.upload_file(
+                 source=output_file_name,
+                 destination=destination_file_path,
+             )
+
+             return statistics
+         except Exception as e:
+             logger.error(f"Error writing statistics: {str(e)}")
+             raise
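
Since write_batches() above accepts both sync and async generators (dispatching on inspect.isasyncgen), chunked extractions can be streamed into a writer without materialising them first. A minimal sketch, again assuming the JsonFileWriter import path and constructor shape from the docstring examples:

```python
# Sketch only: JsonFileWriter's import path and constructor arguments are
# assumed from the docstring examples in application_sdk/io/__init__.py.
import asyncio
from typing import AsyncGenerator

import pandas as pd

from application_sdk.io.json import JsonFileWriter  # assumed import path


async def extract_pages() -> AsyncGenerator[pd.DataFrame, None]:
    """Yield DataFrames one page at a time, e.g. batches from a SQL cursor."""
    for page in range(3):
        yield pd.DataFrame({"id": range(page * 10, (page + 1) * 10)})


async def main() -> None:
    async with JsonFileWriter(path="/data/output/tables") as writer:
        # Empty batches are skipped; non-empty ones are buffered, split into
        # size-bounded chunks, and uploaded via the object store on flush/close.
        await writer.write_batches(extract_pages())

    # Current totals are also available without closing, via the property:
    # writer.statistics -> ActivityStatistics(total_record_count=..., ...)


if __name__ == "__main__":
    asyncio.run(main())
```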