atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +312 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +749 -0
  14. application_sdk/io/json.py +473 -0
  15. application_sdk/{outputs → io}/parquet.py +414 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +16 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +14 -1
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -453
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
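
The 2.x release removes the old inputs and outputs packages (entries 34-43 above) in favor of a single io package (entries 13-16). The hunk below is the new application_sdk/io/__init__.py, which defines the Reader and Writer base classes plus the WriteMode enum. As a rough orientation only, imports in application code would move along the lines sketched here; the concrete module paths are inferred from the file list and from the class names used in the docstrings below, not confirmed by this diff.

# Hypothetical migration sketch (not part of the diff): module paths for the
# concrete readers/writers are assumed from the file list above (io/json.py,
# io/parquet.py) and from the docstrings in io/__init__.py shown below.
from application_sdk.io import Reader, Writer, WriteMode  # new base classes
from application_sdk.io.json import JsonFileReader, JsonFileWriter  # assumed location
from application_sdk.io.parquet import ParquetFileReader  # assumed location
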
application_sdk/io/__init__.py (new file)
@@ -0,0 +1,749 @@
+"""Output module for handling data output operations.
+
+This module provides base classes and utilities for handling various types of data outputs
+in the application, including file outputs and object store interactions.
+"""
+
+import gc
+import inspect
+import os
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    Optional,
+    Union,
+    cast,
+)
+
+import orjson
+from temporalio import activity
+
+from application_sdk.activities.common.models import ActivityStatistics
+from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.common.types import DataframeType
+from application_sdk.constants import ENABLE_ATLAN_UPLOAD, UPSTREAM_OBJECT_STORE_NAME
+from application_sdk.io.utils import (
+    estimate_dataframe_record_size,
+    is_empty_dataframe,
+    path_gen,
+)
+from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.observability.metrics_adaptor import MetricType
+from application_sdk.services.objectstore import ObjectStore
+
+logger = get_logger(__name__)
+activity.logger = logger
+
+
+if TYPE_CHECKING:
+    import daft  # type: ignore
+    import pandas as pd
+
+
+class Reader(ABC):
+    """Abstract base class for reader data sources.
+
+    This class defines the interface for reader handlers that can read data
+    from various sources in different formats. Follows Python's file I/O
+    pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path where the reader will read from.
+        _is_closed (bool): Whether the reader has been closed.
+        _downloaded_files (List[str]): List of downloaded temporary files to clean up.
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using close() explicitly::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Cleans up any downloaded temp files
+
+        Using context manager (recommended)::
+
+            async with ParquetFileReader(path="/data/input") as reader:
+                df = await reader.read()
+            # close() called automatically
+
+        Reading in batches with context manager::
+
+            async with JsonFileReader(path="/data/input") as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+            # close() called automatically
+    """
+
+    path: str
+    _is_closed: bool = False
+    _downloaded_files: List[str] = []
+    cleanup_on_close: bool = True
+
+    async def __aenter__(self) -> "Reader":
+        """Enter the async context manager.
+
+        Returns:
+            Reader: The reader instance.
+        """
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Exit the async context manager, closing the reader.
+
+        Args:
+            exc_type: Exception type if an exception was raised.
+            exc_val: Exception value if an exception was raised.
+            exc_tb: Exception traceback if an exception was raised.
+        """
+        await self.close()
+
+    async def close(self) -> None:
+        """Close the reader and clean up any downloaded temporary files.
+
+        This method cleans up any temporary files that were downloaded from
+        the object store during read operations. Calling close() multiple
+        times is safe (subsequent calls are no-ops).
+
+        Note:
+            Set ``cleanup_on_close=False`` during initialization to retain
+            downloaded files after closing.
+
+        Example::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Cleans up temp files
+        """
+        if self._is_closed:
+            return
+
+        if self.cleanup_on_close and self._downloaded_files:
+            await self._cleanup_downloaded_files()
+
+        self._is_closed = True
+
+    async def _cleanup_downloaded_files(self) -> None:
+        """Clean up downloaded temporary files.
+
+        Override this method in subclasses for custom cleanup behavior.
+        """
+        import shutil
+
+        for file_path in self._downloaded_files:
+            try:
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+                elif os.path.isdir(file_path):
+                    shutil.rmtree(file_path, ignore_errors=True)
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary file {file_path}: {e}")
+
+        self._downloaded_files.clear()
+
+    @abstractmethod
+    def read_batches(
+        self,
+    ) -> Union[
+        Iterator["pd.DataFrame"],
+        AsyncIterator["pd.DataFrame"],
+        Iterator["daft.DataFrame"],
+        AsyncIterator["daft.DataFrame"],
+    ]:
+        """Get an iterator of batched DataFrames.
+
+        Returns:
+            A sync or async iterator of batched pandas or daft DataFrames.
+
+        Raises:
+            NotImplementedError: If the method is not implemented.
+            ValueError: If the reader has been closed.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Get a single pandas or daft DataFrame.
+
+        Returns:
+            Union["pd.DataFrame", "daft.DataFrame"]: A pandas or daft DataFrame.
+
+        Raises:
+            NotImplementedError: If the method is not implemented.
+            ValueError: If the reader has been closed.
+        """
+        raise NotImplementedError
+
+
+class WriteMode(Enum):
+    """Enumeration of write modes for output operations."""
+
+    APPEND = "append"
+    OVERWRITE = "overwrite"
+    OVERWRITE_PARTITIONS = "overwrite-partitions"
+
+
+class Writer(ABC):
+    """Abstract base class for writer handlers.
+
+    This class defines the interface for writer handlers that can write data
+    to various destinations in different formats. Follows Python's file I/O
+    pattern with open/write/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Local path the writer writes its output files to.
+        output_prefix (str): Prefix for files when uploading to object store.
+        total_record_count (int): Total number of records processed.
+        chunk_count (int): Number of chunks the output was split into.
+        buffer_size (int): Maximum number of records buffered before a flush.
+        max_file_size_bytes (int): Maximum size, in bytes, of a single output file.
+        current_buffer_size (int): Current number of records in the buffer.
+        current_buffer_size_bytes (int): Current size of the buffer in bytes.
+        partitions (List[int]): Chunk part indices recorded for each DataFrame written.
+
+    Example:
+        Using close() explicitly::
+
+            writer = JsonFileWriter(path="/data/output")
+            await writer.write(dataframe)
+            await writer.write({"key": "value"})  # Dict support
+            stats = await writer.close()
+
+        Using context manager (recommended)::
+
+            async with JsonFileWriter(path="/data/output") as writer:
+                await writer.write(dataframe)
+            # close() called automatically
+    """
+
+    path: str
+    output_prefix: str
+    total_record_count: int
+    chunk_count: int
+    chunk_part: int
+    buffer_size: int
+    max_file_size_bytes: int
+    current_buffer_size: int
+    current_buffer_size_bytes: int
+    partitions: List[int]
+    extension: str
+    dataframe_type: DataframeType
+    _is_closed: bool = False
+    _statistics: Optional[ActivityStatistics] = None
+
+    async def __aenter__(self) -> "Writer":
+        """Enter the async context manager.
+
+        Returns:
+            Writer: The writer instance.
+        """
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Exit the async context manager, closing the writer.
+
+        Args:
+            exc_type: Exception type if an exception was raised.
+            exc_val: Exception value if an exception was raised.
+            exc_tb: Exception traceback if an exception was raised.
+        """
+        await self.close()
+
+    def _convert_to_dataframe(
+        self,
+        data: Union[
+            "pd.DataFrame", "daft.DataFrame", Dict[str, Any], List[Dict[str, Any]]
+        ],
+    ) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Convert input data to a DataFrame if needed.
+
+        Args:
+            data: Input data - can be a DataFrame, dict, or list of dicts.
+
+        Returns:
+            A pandas or daft DataFrame depending on self.dataframe_type.
+
+        Raises:
+            TypeError: If data type is not supported or if dict/list input is used with daft when daft is not available.
+        """
+        import pandas as pd
+
+        # Already a pandas DataFrame - return as-is or convert to daft if needed
+        if isinstance(data, pd.DataFrame):
+            if self.dataframe_type == DataframeType.daft:
+                try:
+                    import daft
+
+                    return daft.from_pandas(data)
+                except ImportError:
+                    raise TypeError(
+                        "daft is not installed. Please install daft to use DataframeType.daft, "
+                        "or use DataframeType.pandas instead."
+                    )
+            return data
+
+        # Check for daft DataFrame
+        try:
+            import daft
+
+            if isinstance(data, daft.DataFrame):
+                return data
+        except ImportError:
+            pass
+
+        # Convert dict or list of dicts to DataFrame
+        if isinstance(data, dict) or (
+            isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict)
+        ):
+            # For daft dataframe_type, convert to daft DataFrame directly
+            if self.dataframe_type == DataframeType.daft:
+                try:
+                    import daft
+
+                    # Convert to columnar format for daft.from_pydict()
+                    if isinstance(data, dict):
+                        # Single dict: {"col1": "val1", "col2": "val2"} -> {"col1": ["val1"], "col2": ["val2"]}
+                        columnar_data = {k: [v] for k, v in data.items()}
+                    else:
+                        # List of dicts: [{"col1": "v1"}, {"col1": "v2"}] -> {"col1": ["v1", "v2"]}
+                        columnar_data = {}
+                        for record in data:
+                            for key, value in record.items():
+                                if key not in columnar_data:
+                                    columnar_data[key] = []
+                                columnar_data[key].append(value)
+                    return daft.from_pydict(columnar_data)
+                except ImportError:
+                    raise TypeError(
+                        "Dict and list inputs require daft to be installed when using DataframeType.daft. "
+                        "Please install daft or use DataframeType.pandas instead."
+                    )
+            # For pandas dataframe_type, convert to pandas DataFrame
+            return pd.DataFrame([data] if isinstance(data, dict) else data)
+
+        raise TypeError(
+            f"Unsupported data type: {type(data).__name__}. "
+            "Expected DataFrame, dict, or list of dicts."
+        )
+
+    async def write(
+        self,
+        data: Union[
+            "pd.DataFrame", "daft.DataFrame", Dict[str, Any], List[Dict[str, Any]]
+        ],
+        **kwargs: Any,
+    ) -> None:
+        """Write data to the output destination.
+
+        Supports writing DataFrames, dicts (converted to a single-row DataFrame),
+        or lists of dicts (converted to a multi-row DataFrame).
+
+        Args:
+            data: Data to write - DataFrame, dict, or list of dicts.
+            **kwargs: Additional parameters passed to the underlying write method.
+
+        Raises:
+            ValueError: If the writer has been closed or dataframe_type is unsupported.
+            TypeError: If data type is not supported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot write to a closed writer")
+
+        # Convert to DataFrame if needed
+        dataframe = self._convert_to_dataframe(data)
+
+        if self.dataframe_type == DataframeType.pandas:
+            await self._write_dataframe(dataframe, **kwargs)
+        elif self.dataframe_type == DataframeType.daft:
+            await self._write_daft_dataframe(dataframe, **kwargs)
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def write_batches(
+        self,
+        dataframe: Union[
+            AsyncGenerator["pd.DataFrame", None],
+            Generator["pd.DataFrame", None, None],
+            AsyncGenerator["daft.DataFrame", None],
+            Generator["daft.DataFrame", None, None],
+        ],
+    ) -> None:
+        """Write batched DataFrames to the output destination.
+
+        Args:
+            dataframe: Async or sync generator yielding DataFrames.
+
+        Raises:
+            ValueError: If the writer has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot write to a closed writer")
+
+        if self.dataframe_type == DataframeType.pandas:
+            await self._write_batched_dataframe(dataframe)
+        elif self.dataframe_type == DataframeType.daft:
+            await self._write_batched_daft_dataframe(dataframe)
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def _write_batched_dataframe(
+        self,
+        batched_dataframe: Union[
+            AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
+        ],
+    ):
+        """Write a stream of batched pandas DataFrames to the output.
+
+        Each non-empty batch is written via _write_dataframe, which may split it
+        into chunks based on the buffer_size and max_file_size_bytes settings.
+
+        Args:
+            batched_dataframe: Async or sync generator yielding pandas DataFrames.
+
+        Note:
+            Empty DataFrames are skipped without writing.
+        """
+        try:
+            if inspect.isasyncgen(batched_dataframe):
+                async for dataframe in batched_dataframe:
+                    if not is_empty_dataframe(dataframe):
+                        await self._write_dataframe(dataframe)
+            else:
+                # Cast to Generator since we've confirmed it's not an AsyncGenerator
+                sync_generator = cast(
+                    Generator["pd.DataFrame", None, None], batched_dataframe
+                )
+                for dataframe in sync_generator:
+                    if not is_empty_dataframe(dataframe):
+                        await self._write_dataframe(dataframe)
+        except Exception as e:
+            logger.error(f"Error writing batched dataframe: {str(e)}")
+            raise
+
+    async def _write_dataframe(self, dataframe: "pd.DataFrame", **kwargs):
+        """Write a pandas DataFrame to chunked output files and upload them to the object store.
+
+        Args:
+            dataframe (pd.DataFrame): The DataFrame to write.
+            **kwargs: Additional parameters (currently unused for pandas DataFrames).
+        """
+        try:
+            if self.chunk_start is None:
+                self.chunk_part = 0
+            if len(dataframe) == 0:
+                return
+
+            chunk_size_bytes = estimate_dataframe_record_size(dataframe, self.extension)
+
+            for i in range(0, len(dataframe), self.buffer_size):
+                chunk = dataframe[i : i + self.buffer_size]
+
+                if (
+                    self.current_buffer_size_bytes + chunk_size_bytes
+                    > self.max_file_size_bytes
+                ):
+                    output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, extension=self.extension)}"
+                    if os.path.exists(output_file_name):
+                        await self._upload_file(output_file_name)
+                        self.chunk_part += 1
+
+                self.current_buffer_size += len(chunk)
+                self.current_buffer_size_bytes += chunk_size_bytes * len(chunk)
+                await self._flush_buffer(chunk, self.chunk_part)
+
+                del chunk
+                gc.collect()
+
+            if self.current_buffer_size_bytes > 0:
+                # Finally upload the final file to the object store
+                output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, extension=self.extension)}"
+                if os.path.exists(output_file_name):
+                    await self._upload_file(output_file_name)
+                    self.chunk_part += 1
+
+            # Record metrics for successful write
+            self.metrics.record_metric(
+                name="write_records",
+                value=len(dataframe),
+                metric_type=MetricType.COUNTER,
+                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
+                description="Number of records written to files from pandas DataFrame",
+            )
+
+            # Record chunk metrics
+            self.metrics.record_metric(
+                name="chunks_written",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
+                description="Number of chunks written to files",
+            )
+
+            # If chunk_start is set we don't want to increment the chunk_count,
+            # since it should only increment the chunk_part in this case
+            if self.chunk_start is None:
+                self.chunk_count += 1
+            self.partitions.append(self.chunk_part)
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={
+                    "type": "pandas",
+                    "mode": WriteMode.APPEND.value,
+                    "error": str(e),
+                },
+                description="Number of errors while writing to files",
+            )
+            logger.error(f"Error writing pandas dataframe to files: {str(e)}")
+            raise
+
+    async def _write_batched_daft_dataframe(
+        self,
+        batched_dataframe: Union[
+            AsyncGenerator["daft.DataFrame", None],  # noqa: F821
+            Generator["daft.DataFrame", None, None],  # noqa: F821
+        ],
+    ):
+        """Write a stream of batched daft DataFrames to the output.
+
+        Each non-empty batch is written via _write_daft_dataframe, which may split
+        it into chunks based on the buffer_size and max_file_size_bytes settings.
+
+        Args:
+            batched_dataframe: Async or sync generator yielding daft DataFrames.
+
+        Note:
+            Empty DataFrames are skipped without writing.
+        """
+        try:
+            if inspect.isasyncgen(batched_dataframe):
+                async for dataframe in batched_dataframe:
+                    if not is_empty_dataframe(dataframe):
+                        await self._write_daft_dataframe(dataframe)
+            else:
+                # Cast to Generator since we've confirmed it's not an AsyncGenerator
+                sync_generator = cast(
+                    Generator["daft.DataFrame", None, None], batched_dataframe
+                )  # noqa: F821
+                for dataframe in sync_generator:
+                    if not is_empty_dataframe(dataframe):
+                        await self._write_daft_dataframe(dataframe)
+        except Exception as e:
+            logger.error(f"Error writing batched daft dataframe: {str(e)}")
+            raise
+
+    @abstractmethod
+    async def _write_daft_dataframe(self, dataframe: "daft.DataFrame", **kwargs):  # noqa: F821
+        """Write a daft DataFrame to the output destination.
+
+        Args:
+            dataframe (daft.DataFrame): The DataFrame to write.
+            **kwargs: Additional parameters passed through from write().
+        """
+        pass
+
+    @property
+    def statistics(self) -> ActivityStatistics:
+        """Get current statistics without closing the writer.
+
+        Returns:
+            ActivityStatistics: Current statistics (record count, chunk count, partitions).
+
+        Note:
+            This returns the current state. For final statistics after all
+            writes complete, use close() instead.
+        """
+        return ActivityStatistics(
+            total_record_count=self.total_record_count,
+            chunk_count=len(self.partitions),
+            partitions=self.partitions,
+        )
+
+    async def _finalize(self) -> None:
+        """Finalize the writer before closing.
+
+        Override this method in subclasses to perform any final flush operations,
+        upload remaining files, etc. This is called by close() before writing statistics.
+        """
+        pass
+
+    async def close(self) -> ActivityStatistics:
+        """Close the writer, flush buffers, upload files, and return statistics.
+
+        This method finalizes all pending writes, uploads any remaining files to
+        the object store, writes statistics, and marks the writer as closed.
+        Calling close() multiple times is safe (subsequent calls are no-ops).
+
+        The typename for statistics is automatically taken from `self.typename`
+        if it was set during initialization.
+
+        Returns:
+            ActivityStatistics: Final statistics including total_record_count,
+                chunk_count, and partitions.
+
+        Raises:
+            ValueError: If statistics data is invalid.
+            Exception: If there's an error during finalization or writing statistics.
+
+        Example:
+            ```python
+            writer = JsonFileWriter(path="/data/output", typename="table")
+            await writer.write(dataframe)
+            stats = await writer.close()
+            print(f"Wrote {stats.total_record_count} records")
+            ```
+        """
+        if self._is_closed:
+            if self._statistics:
+                return self._statistics
+            return self.statistics
+
+        try:
+            # Allow subclasses to perform final flush/upload operations
+            await self._finalize()
+
+            # Use self.typename if available
+            typename = getattr(self, "typename", None)
+
+            # Write statistics to file and object store
+            statistics_dict = await self._write_statistics(typename)
+            if not statistics_dict:
+                raise ValueError("No statistics data available")
+
+            self._statistics = ActivityStatistics.model_validate(statistics_dict)
+            if typename:
+                self._statistics.typename = typename
+
+            self._is_closed = True
+            return self._statistics
+
+        except Exception as e:
+            logger.error(f"Error closing writer: {str(e)}")
+            raise
+
+    async def _upload_file(self, file_name: str):
+        """Upload a file to the object store."""
+        # Get retain_local_copy from the writer instance, defaulting to False
+        retain_local = getattr(self, "retain_local_copy", False)
+
+        if ENABLE_ATLAN_UPLOAD:
+            await ObjectStore.upload_file(
+                source=file_name,
+                store_name=UPSTREAM_OBJECT_STORE_NAME,
+                retain_local_copy=True,  # Always retain for the second upload to deployment store
+                destination=get_object_store_prefix(file_name),
+            )
+        await ObjectStore.upload_file(
+            source=file_name,
+            destination=get_object_store_prefix(file_name),
+            retain_local_copy=retain_local,  # Respect the writer's retain_local_copy setting
+        )
+
+        self.current_buffer_size_bytes = 0
+
+    async def _flush_buffer(self, chunk: "pd.DataFrame", chunk_part: int):
+        """Flush a chunk of buffered records to an output file.
+
+        This method writes the given chunk to an output file via _write_chunk;
+        uploading to the object store happens separately in _upload_file.
+
+        Note:
+            If the chunk is empty, nothing is written.
+        """
+        try:
+            if not is_empty_dataframe(chunk):
+                self.total_record_count += len(chunk)
+                output_file_name = f"{self.path}/{path_gen(self.chunk_count, chunk_part, extension=self.extension)}"
+                await self._write_chunk(chunk, output_file_name)
+
+            self.current_buffer_size = 0
+
+            # Record chunk metrics
+            self.metrics.record_metric(
+                name="chunks_written",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "output"},
+                description="Number of chunks written to files",
+            )
+
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "output", "error": str(e)},
+                description="Number of errors while writing to files",
+            )
+            logger.error(f"Error flushing buffer to files: {str(e)}")
+            raise e
+
+    async def _write_statistics(
+        self, typename: Optional[str] = None
+    ) -> Optional[Dict[str, Any]]:
+        """Write statistics about the output to a JSON file.
+
+        Internal method called by close() to persist statistics.
+
+        Args:
+            typename (str, optional): Type name for organizing statistics.
+
+        Returns:
+            Dict containing statistics data.
+
+        Raises:
+            Exception: If there's an error writing or uploading the statistics.
+        """
+        try:
+            # Prepare the statistics
+            statistics = {
+                "total_record_count": self.total_record_count,
+                "chunk_count": len(self.partitions),
+                "partitions": self.partitions,
+            }
+
+            # Ensure typename is included in the statistics payload (if provided)
+            if typename:
+                statistics["typename"] = typename
+
+            # Write the statistics to a JSON file inside a dedicated statistics/ folder
+            statistics_dir = os.path.join(self.path, "statistics")
+            os.makedirs(statistics_dir, exist_ok=True)
+            output_file_name = os.path.join(statistics_dir, "statistics.json.ignore")
+            # If chunk_start is provided, include it in the statistics filename
+            try:
+                cs = getattr(self, "chunk_start", None)
+                if cs is not None:
+                    output_file_name = os.path.join(
+                        statistics_dir, f"statistics-chunk-{cs}.json.ignore"
+                    )
+            except Exception:
+                # If accessing chunk_start fails, fall back to the default filename
+                pass
+
+            # Write the statistics dictionary to the JSON file
+            with open(output_file_name, "wb") as f:
+                f.write(orjson.dumps(statistics))
+
+            destination_file_path = get_object_store_prefix(output_file_name)
+            # Push the file to the object store
+            await ObjectStore.upload_file(
+                source=output_file_name,
+                destination=destination_file_path,
+            )
+
+            return statistics
+        except Exception as e:
+            logger.error(f"Error writing statistics: {str(e)}")
+            raise
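
For context, the sketch below strings together the usage shown in the docstrings above: Reader/Writer async context managers, dict support in write(), and the ActivityStatistics returned by close(). JsonFileWriter, ParquetFileReader, and the path/typename arguments come from those docstrings; the module paths and everything else here are assumptions for illustration, not APIs confirmed by this diff.

# Illustrative sketch only: pieced together from the docstring examples in
# io/__init__.py above. Module paths and runtime defaults are assumptions.
import asyncio

import pandas as pd

from application_sdk.io.json import JsonFileWriter  # assumed module path
from application_sdk.io.parquet import ParquetFileReader  # assumed module path


async def main() -> None:
    df = pd.DataFrame({"name": ["orders", "customers"], "schema": ["sales", "sales"]})

    # Writer lifecycle: the async context manager calls close() on exit,
    # which flushes buffers and uploads the chunked files.
    async with JsonFileWriter(path="/data/output", typename="table") as writer:
        await writer.write(df)  # DataFrame input
        await writer.write({"name": "invoices", "schema": "sales"})  # dict input

    # Explicit close() returns ActivityStatistics
    # (total_record_count, chunk_count, partitions).
    writer = JsonFileWriter(path="/data/output", typename="table")
    await writer.write(df)
    stats = await writer.close()
    print(f"Wrote {stats.total_record_count} records in {stats.chunk_count} chunks")

    # Reader lifecycle: close() removes any temp files downloaded from the
    # object store during read().
    async with ParquetFileReader(path="/data/output") as reader:
        frame = await reader.read()  # single pandas or daft DataFrame


asyncio.run(main())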