atlan-application-sdk 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (47)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/secretstore.py +1 -1
  25. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  26. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  27. application_sdk/version.py +1 -1
  28. application_sdk/worker.py +1 -1
  29. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +35 -42
  31. application_sdk/common/dataframe_utils.py +0 -42
  32. application_sdk/events/__init__.py +0 -5
  33. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  34. application_sdk/inputs/__init__.py +0 -168
  35. application_sdk/inputs/iceberg.py +0 -75
  36. application_sdk/inputs/json.py +0 -136
  37. application_sdk/inputs/parquet.py +0 -272
  38. application_sdk/inputs/sql_query.py +0 -271
  39. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  40. application_sdk/outputs/__init__.py +0 -453
  41. application_sdk/outputs/iceberg.py +0 -139
  42. application_sdk/outputs/json.py +0 -268
  43. /application_sdk/{events → interceptors}/models.py +0 -0
  44. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  45. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
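Note on the 2.0.0 restructuring: the separate application_sdk/inputs/* and application_sdk/outputs/* packages from 1.1.1 (items 33-42) are removed and replaced by a single application_sdk/io package (items 13-16). The hunk below is the new application_sdk/io/json.py. As a rough sketch based only on names visible in this diff (the 1.x class names are not shown here), a consumer of the JSON reader and writer would now import:

# New in 2.0.0: both the read and write paths live under application_sdk.io.
# The 1.x modules application_sdk/inputs/json.py and application_sdk/outputs/json.py
# are deleted in this release (items 36 and 42 above).
from application_sdk.common.types import DataframeType  # from common/types.py (+8 lines in this release)
from application_sdk.io.json import JsonFileReader, JsonFileWriter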
application_sdk/io/json.py (new file; item 14 above)
@@ -0,0 +1,429 @@
+import os
+from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Union
+
+import orjson
+from temporalio import activity
+
+from application_sdk.common.types import DataframeType
+from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
+from application_sdk.io.utils import (
+    JSON_FILE_EXTENSION,
+    convert_datetime_to_epoch,
+    download_files,
+    path_gen,
+    process_null_fields,
+)
+from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
+
+if TYPE_CHECKING:
+    import daft
+    import pandas as pd
+
+from application_sdk.io import Reader, Writer
+
+logger = get_logger(__name__)
+activity.logger = logger
+
+
+class JsonFileReader(Reader):
+    """
+    JSON File Reader class to read data from JSON files using daft and pandas.
+    Supports reading both single files and directories containing multiple JSON files.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        file_names: Optional[List[str]] = None,
+        chunk_size: Optional[int] = 100000,
+        dataframe_type: DataframeType = DataframeType.pandas,
+    ):
+        """Initialize the JsonInput class.
+
+        Args:
+            path (str): Path to JSON file or directory containing JSON files.
+                It accepts both types of paths:
+                local path or object store path
+                Wildcards are not supported.
+            file_names (Optional[List[str]]): List of specific file names to read. Defaults to None.
+            chunk_size (int): Number of rows per batch. Defaults to 100000.
+
+        Raises:
+            ValueError: When path is not provided or when single file path is combined with file_names
+        """
+        self.extension = JSON_FILE_EXTENSION
+
+        # Validate that single file path and file_names are not both specified
+        if path.endswith(self.extension) and file_names:
+            raise ValueError(
+                f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+            )
+
+        self.path = path
+        self.chunk_size = chunk_size
+        self.file_names = file_names
+        self.dataframe_type = dataframe_type
+
+    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """
+        Method to read the data from the json files in the path
+        and return as a single combined pandas dataframe
+        """
+        if self.dataframe_type == DataframeType.pandas:
+            return await self._get_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return await self._get_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    def read_batches(
+        self,
+    ) -> Union[
+        AsyncIterator["pd.DataFrame"],
+        AsyncIterator["daft.DataFrame"],
+    ]:
+        """
+        Method to read the data from the json files in the path
+        and return as a batched pandas dataframe
+        """
+        if self.dataframe_type == DataframeType.pandas:
+            return self._get_batched_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return self._get_batched_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def _get_batched_dataframe(
+        self,
+    ) -> AsyncIterator["pd.DataFrame"]:
+        """
+        Method to read the data from the json files in the path
+        and return as a batched pandas dataframe
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            logger.info(f"Reading {len(json_files)} JSON files in batches")
+
+            for json_file in json_files:
+                json_reader_obj = pd.read_json(
+                    json_file,
+                    chunksize=self.chunk_size,
+                    lines=True,
+                )
+                for chunk in json_reader_obj:
+                    yield chunk
+        except Exception as e:
+            logger.error(f"Error reading batched data from JSON: {str(e)}")
+            raise
+
+    async def _get_dataframe(self) -> "pd.DataFrame":
+        """
+        Method to read the data from the json files in the path
+        and return as a single combined pandas dataframe
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            logger.info(f"Reading {len(json_files)} JSON files as pandas dataframe")
+
+            return pd.concat(
+                (pd.read_json(json_file, lines=True) for json_file in json_files),
+                ignore_index=True,
+            )
+
+        except Exception as e:
+            logger.error(f"Error reading data from JSON: {str(e)}")
+            raise
+
+    async def _get_batched_daft_dataframe(
+        self,
+    ) -> AsyncIterator["daft.DataFrame"]:  # noqa: F821
+        """
+        Method to read the data from the json files in the path
+        and return as a batched daft dataframe
+        """
+        try:
+            import daft
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            logger.info(f"Reading {len(json_files)} JSON files as daft batches")
+
+            # Yield each discovered file as separate batch with chunking
+            for json_file in json_files:
+                yield daft.read_json(json_file, _chunk_size=self.chunk_size)
+        except Exception as e:
+            logger.error(f"Error reading batched data from JSON using daft: {str(e)}")
+            raise
+
+    async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
+        """
+        Method to read the data from the json files in the path
+        and return as a single combined daft dataframe
+        """
+        try:
+            import daft
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            logger.info(f"Reading {len(json_files)} JSON files with daft")
+
+            # Use the discovered/downloaded files directly
+            return daft.read_json(json_files)
+        except Exception as e:
+            logger.error(f"Error reading data from JSON using daft: {str(e)}")
+            raise
+
+
+class JsonFileWriter(Writer):
+    """Output handler for writing data to JSON files.
+
+    This class provides functionality for writing data to JSON files with support
+    for chunking large datasets, buffering, and automatic file path generation.
+    It can handle both pandas and daft DataFrames as input.
+
+    The output can be written to local files and optionally uploaded to an object
+    store. Files are named using a configurable path generation scheme that
+    includes chunk numbers for split files.
+
+    Attributes:
+        path (str): Full path where JSON files will be written.
+        typename (Optional[str]): Type identifier for the data being written.
+        chunk_start (Optional[int]): Starting index for chunk numbering.
+        buffer_size (int): Size of the write buffer in bytes.
+        chunk_size (int): Maximum number of records per chunk.
+        total_record_count (int): Total number of records processed.
+        chunk_count (int): Number of chunks written.
+        buffer (List[Union[pd.DataFrame, daft.DataFrame]]): Buffer for accumulating
+            data before writing.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        typename: Optional[str] = None,
+        chunk_start: Optional[int] = None,
+        buffer_size: Optional[int] = 5000,
+        chunk_size: Optional[int] = 50000,  # to limit the memory usage on upload
+        total_record_count: Optional[int] = 0,
+        chunk_count: Optional[int] = 0,
+        start_marker: Optional[str] = None,
+        end_marker: Optional[str] = None,
+        retain_local_copy: Optional[bool] = False,
+        dataframe_type: DataframeType = DataframeType.pandas,
+        **kwargs: Dict[str, Any],
+    ):
+        """Initialize the JSON output handler.
+
+        Args:
+            path (str): Full path where JSON files will be written.
+            typename (Optional[str], optional): Type identifier for the data being written.
+                If provided, a subdirectory with this name will be created under path.
+                Defaults to None.
+            chunk_start (Optional[int], optional): Starting index for chunk numbering.
+                Defaults to None.
+            buffer_size (int, optional): Size of the buffer in bytes.
+                Defaults to 10MB (1024 * 1024 * 10).
+            chunk_size (Optional[int], optional): Maximum number of records per chunk. If None, uses config value.
+                Defaults to None.
+            total_record_count (int, optional): Initial total record count.
+                Defaults to 0.
+            chunk_count (int, optional): Initial chunk count.
+                Defaults to 0.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
+            dataframe_type (DataframeType, optional): Type of dataframe to write. Defaults to DataframeType.pandas.
+        """
+        self.path = path
+        self.typename = typename
+        self.chunk_start = chunk_start
+        self.total_record_count = total_record_count
+        self.chunk_count = chunk_count
+        self.buffer_size = buffer_size
+        self.chunk_size = chunk_size or 50000  # to limit the memory usage on upload
+        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
+        self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+        self.max_file_size_bytes = int(
+            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+        )  # 90% of DAPR limit as safety buffer
+        self.start_marker = start_marker
+        self.end_marker = end_marker
+        self.partitions = []
+        self.chunk_part = 0
+        self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy
+        self.extension = JSON_FILE_EXTENSION
+        self.dataframe_type = dataframe_type
+        self._is_closed = False
+        self._statistics = None
+
+        if not self.path:
+            raise ValueError("path is required")
+
+        if typename:
+            self.path = os.path.join(self.path, typename)
+            os.makedirs(self.path, exist_ok=True)
+
+        if self.chunk_start:
+            self.chunk_count = self.chunk_start + self.chunk_count
+
+    async def _write_daft_dataframe(
+        self,
+        dataframe: "daft.DataFrame",
+        preserve_fields: Optional[List[str]] = None,
+        null_to_empty_dict_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):  # noqa: F821
+        """Write a daft DataFrame to JSON files.
+
+        This method converts the daft DataFrame to pandas and writes it to JSON files.
+
+        Args:
+            dataframe (daft.DataFrame): The DataFrame to write.
+            preserve_fields (Optional[List[str]]): List of fields to preserve during null processing.
+                Defaults to ["identity_cycle", "number_columns_in_part_key",
+                "columns_participating_in_part_key", "engine", "is_insertable_into", "is_typed"].
+            null_to_empty_dict_fields (Optional[List[str]]): List of fields to convert from null to empty dict.
+                Defaults to ["attributes", "customAttributes"].
+
+        Note:
+            Daft does not have built-in JSON writing support, so we are using orjson.
+        """
+        # Initialize default values for mutable arguments
+        if preserve_fields is None:
+            preserve_fields = [
+                "identity_cycle",
+                "number_columns_in_part_key",
+                "columns_participating_in_part_key",
+                "engine",
+                "is_insertable_into",
+                "is_typed",
+            ]
+        if null_to_empty_dict_fields is None:
+            null_to_empty_dict_fields = [
+                "attributes",
+                "customAttributes",
+            ]
+
+        try:
+            if self.chunk_start is None:
+                self.chunk_part = 0
+
+            buffer = []
+            for row in dataframe.iter_rows():
+                self.total_record_count += 1
+                # Convert datetime fields to epoch timestamps before serialization
+                row = convert_datetime_to_epoch(row)
+                # Remove null attributes from the row recursively, preserving specified fields
+                cleaned_row = process_null_fields(
+                    row, preserve_fields, null_to_empty_dict_fields
+                )
+                # Serialize the row and add it to the buffer
+                serialized_row = orjson.dumps(
+                    cleaned_row, option=orjson.OPT_APPEND_NEWLINE
+                )
+                buffer.append(serialized_row)
+                self.current_buffer_size += 1
+                self.current_buffer_size_bytes += len(serialized_row)
+
+                # If the buffer size is reached append to the file and clear the buffer
+                if self.current_buffer_size >= self.buffer_size:
+                    await self._flush_daft_buffer(buffer, self.chunk_part)

+                if self.current_buffer_size_bytes > self.max_file_size_bytes or (
+                    self.total_record_count > 0
+                    and self.total_record_count % self.chunk_size == 0
+                ):
+                    output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker, extension=self.extension)}"
+                    if os.path.exists(output_file_name):
+                        await self._upload_file(output_file_name)
+                        self.chunk_part += 1
+
+            # Write any remaining rows in the buffer
+            if self.current_buffer_size > 0:
+                await self._flush_daft_buffer(buffer, self.chunk_part)
+
+            # Record metrics for successful write
+            self.metrics.record_metric(
+                name="json_write_records",
+                value=dataframe.count_rows(),
+                metric_type=MetricType.COUNTER,
+                labels={"type": "daft"},
+                description="Number of records written to JSON files from daft DataFrame",
+            )
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="json_write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "daft", "error": str(e)},
+                description="Number of errors while writing to JSON files",
+            )
+            logger.error(f"Error writing daft dataframe to json: {str(e)}")
+            raise
+
+    async def _flush_daft_buffer(self, buffer: List[str], chunk_part: int):
+        """Flush the current buffer to a JSON file.
+
+        This method combines all DataFrames in the buffer, writes them to a JSON file,
+        and uploads the file to the object store.
+        """
+        output_file_name = f"{self.path}/{path_gen(self.chunk_count, chunk_part, self.start_marker, self.end_marker, extension=self.extension)}"
+        with open(output_file_name, "ab+") as f:
+            f.writelines(buffer)
+            buffer.clear()  # Clear the buffer
+
+        self.current_buffer_size = 0
+
+        # Record chunk metrics
+        self.metrics.record_metric(
+            name="json_chunks_written",
+            value=1,
+            metric_type=MetricType.COUNTER,
+            labels={"type": "daft"},
+            description="Number of chunks written to JSON files",
+        )
+
+    async def _write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+        """Write a chunk to a JSON file.
+
+        This method writes a chunk to a JSON file and uploads the file to the object store.
+        """
+        mode = "w" if not os.path.exists(file_name) else "a"
+        with open(file_name, mode=mode) as f:
+            chunk.to_json(f, orient="records", lines=True)
+
+    async def _finalize(self) -> None:
+        """Finalize the JSON writer before closing.
+
+        Uploads any remaining buffered data to the object store.
+        """
+        # Upload the final file if there's remaining buffered data
+        if self.current_buffer_size_bytes > 0:
+            output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker, extension=self.extension)}"
+            if os.path.exists(output_file_name):
+                await self._upload_file(output_file_name)
+                self.chunk_part += 1
+
+        # If chunk_start is set we don't want to increment the chunk_count
+        # Since it should only increment the chunk_part in this case
+        if self.chunk_start is None:
+            self.chunk_count += 1
+        self.partitions.append(self.chunk_part)
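
Taken together, the new module gives a file-based JSON round trip. The sketch below uses only what is visible in the hunk above (the JsonFileReader constructor, read(), read_batches(), DataframeType, and the JsonFileWriter constructor); the write() and close() calls are an assumption about the public surface of the Writer base class defined in application_sdk/io/__init__.py, which this diff section does not show:

import asyncio

from application_sdk.common.types import DataframeType
from application_sdk.io.json import JsonFileReader, JsonFileWriter


async def main() -> None:
    # Combine every .json file under ./raw into one pandas DataFrame.
    reader = JsonFileReader(path="./raw", dataframe_type=DataframeType.pandas)
    combined = await reader.read()

    # Or stream the same files in chunks of up to 100,000 rows (the default chunk_size).
    async for chunk in reader.read_batches():
        print(f"got a chunk with {len(chunk)} rows")

    # Write the combined frame back out as chunked, line-delimited JSON.
    # Assumption: write()/close() are the public entry points inherited from the
    # Writer base class (application_sdk/io/__init__.py), not shown in this hunk.
    writer = JsonFileWriter(path="./out", typename="table", chunk_size=50000)
    await writer.write(combined)
    await writer.close()


if __name__ == "__main__":
    asyncio.run(main())

Because typename is set, the writer creates and writes into ./out/table/ (see the os.path.join and os.makedirs calls in its constructor), rotating output files once a chunk reaches chunk_size records or roughly 90% of DAPR_MAX_GRPC_MESSAGE_LENGTH bytes.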