atlan-application-sdk 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registries.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +30 -7
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -445
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
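
Most of the churn in this release is the consolidation of the separate application_sdk/inputs and application_sdk/outputs packages into a single application_sdk/io package: items 13-16 add the new io modules, item 15 moves parquet.py from outputs to io, and items 34-43 delete the old inputs and outputs modules. Only these module moves are visible in the file list; whether 2.0.0 re-exports the old class names from application_sdk.io is not shown here. As a rough, hypothetical compatibility sketch for code that needs to import the JSON module across both layouts:

# Hypothetical compatibility shim -- only the module moves are confirmed by the
# file list above; the symbols exported by the 2.0.0 modules are not shown here.
try:
    # 2.0.0 layout: consolidated io package (application_sdk/io/json.py, item 14)
    from application_sdk.io import json as sdk_json
except ImportError:
    # 1.1.0 layout: separate outputs package (application_sdk/outputs/json.py, item 43)
    from application_sdk.outputs import json as sdk_json

print(sdk_json.__name__)  # reports which layout the installed SDK provides

In practice, pinning to one major version is simpler; the shim only papers over the module move, not any API changes inside the new io package.
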
@@ -1,268 +0,0 @@
-import os
-from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
-
-import orjson
-from temporalio import activity
-
-from application_sdk.activities.common.models import ActivityStatistics
-from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
-from application_sdk.observability.logger_adaptor import get_logger
-from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
-from application_sdk.outputs import Output
-
-logger = get_logger(__name__)
-activity.logger = logger
-
-if TYPE_CHECKING:
-    import daft  # type: ignore
-    import pandas as pd
-
-
-def convert_datetime_to_epoch(data: Any) -> Any:
-    """Convert datetime objects to epoch timestamps in milliseconds.
-
-    Args:
-        data: The data to convert
-
-    Returns:
-        The converted data with datetime fields as epoch timestamps
-    """
-    if isinstance(data, datetime):
-        return int(data.timestamp() * 1000)
-    elif isinstance(data, dict):
-        return {k: convert_datetime_to_epoch(v) for k, v in data.items()}
-    elif isinstance(data, list):
-        return [convert_datetime_to_epoch(item) for item in data]
-    return data
-
-
-class JsonOutput(Output):
-    """Output handler for writing data to JSON files.
-
-    This class provides functionality for writing data to JSON files with support
-    for chunking large datasets, buffering, and automatic file path generation.
-    It can handle both pandas and daft DataFrames as input.
-
-    The output can be written to local files and optionally uploaded to an object
-    store. Files are named using a configurable path generation scheme that
-    includes chunk numbers for split files.
-
-    Attributes:
-        output_path (Optional[str]): Base path where JSON files will be written.
-        output_suffix (str): Suffix added to file paths when uploading to object store.
-        typename (Optional[str]): Type identifier for the data being written.
-        chunk_start (Optional[int]): Starting index for chunk numbering.
-        buffer_size (int): Size of the write buffer in bytes.
-        chunk_size (int): Maximum number of records per chunk.
-        total_record_count (int): Total number of records processed.
-        chunk_count (int): Number of chunks written.
-        buffer (List[Union[pd.DataFrame, daft.DataFrame]]): Buffer for accumulating
-            data before writing.
-    """
-
-    _EXTENSION = ".json"
-
-    def __init__(
-        self,
-        output_suffix: str,
-        output_path: Optional[str] = None,
-        typename: Optional[str] = None,
-        chunk_start: Optional[int] = None,
-        buffer_size: int = 5000,
-        chunk_size: Optional[int] = 50000,  # to limit the memory usage on upload
-        total_record_count: int = 0,
-        chunk_count: int = 0,
-        start_marker: Optional[str] = None,
-        end_marker: Optional[str] = None,
-        retain_local_copy: bool = False,
-        **kwargs: Dict[str, Any],
-    ):
-        """Initialize the JSON output handler.
-
-        Args:
-            output_path (str): Path where JSON files will be written.
-            output_suffix (str): Prefix for files when uploading to object store.
-            chunk_start (Optional[int], optional): Starting index for chunk numbering.
-                Defaults to None.
-            buffer_size (int, optional): Size of the buffer in bytes.
-                Defaults to 10MB (1024 * 1024 * 10).
-            chunk_size (Optional[int], optional): Maximum number of records per chunk. If None, uses config value.
-                Defaults to None.
-            total_record_count (int, optional): Initial total record count.
-                Defaults to 0.
-            chunk_count (int, optional): Initial chunk count.
-                Defaults to 0.
-            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
-                Defaults to False.
-        """
-        self.output_path = output_path
-        self.output_suffix = output_suffix
-        self.typename = typename
-        self.chunk_start = chunk_start
-        self.total_record_count = total_record_count
-        self.chunk_count = chunk_count
-        self.buffer_size = buffer_size
-        self.chunk_size = chunk_size or 50000  # to limit the memory usage on upload
-        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
-        self.current_buffer_size = 0
-        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
-        self.max_file_size_bytes = int(
-            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
-        )  # 90% of DAPR limit as safety buffer
-        self.start_marker = start_marker
-        self.end_marker = end_marker
-        self.partitions = []
-        self.chunk_part = 0
-        self.metrics = get_metrics()
-        self.retain_local_copy = retain_local_copy
-
-        if not self.output_path:
-            raise ValueError("output_path is required")
-
-        self.output_path = os.path.join(self.output_path, output_suffix)
-        if typename:
-            self.output_path = os.path.join(self.output_path, typename)
-        os.makedirs(self.output_path, exist_ok=True)
-
-        if self.chunk_start:
-            self.chunk_count = self.chunk_start + self.chunk_count
-
-    async def write_daft_dataframe(
-        self,
-        dataframe: "daft.DataFrame",
-        preserve_fields: Optional[List[str]] = [
-            "identity_cycle",
-            "number_columns_in_part_key",
-            "columns_participating_in_part_key",
-            "engine",
-            "is_insertable_into",
-            "is_typed",
-        ],
-        null_to_empty_dict_fields: Optional[List[str]] = [
-            "attributes",
-            "customAttributes",
-        ],
-    ):  # noqa: F821
-        """Write a daft DataFrame to JSON files.
-
-        This method converts the daft DataFrame to pandas and writes it to JSON files.
-
-        Args:
-            dataframe (daft.DataFrame): The DataFrame to write.
-
-        Note:
-            Daft does not have built-in JSON writing support, so we are using orjson.
-        """
-        try:
-            if self.chunk_start is None:
-                self.chunk_part = 0
-
-            buffer = []
-            for row in dataframe.iter_rows():
-                self.total_record_count += 1
-                # Convert datetime fields to epoch timestamps before serialization
-                row = convert_datetime_to_epoch(row)
-                # Remove null attributes from the row recursively, preserving specified fields
-                cleaned_row = self.process_null_fields(
-                    row, preserve_fields, null_to_empty_dict_fields
-                )
-                # Serialize the row and add it to the buffer
-                serialized_row = orjson.dumps(
-                    cleaned_row, option=orjson.OPT_APPEND_NEWLINE
-                )
-                buffer.append(serialized_row)
-                self.current_buffer_size += 1
-                self.current_buffer_size_bytes += len(serialized_row)
-
-                # If the buffer size is reached append to the file and clear the buffer
-                if self.current_buffer_size >= self.buffer_size:
-                    await self.flush_daft_buffer(buffer, self.chunk_part)
-
-                if self.current_buffer_size_bytes > self.max_file_size_bytes or (
-                    self.total_record_count > 0
-                    and self.total_record_count % self.chunk_size == 0
-                ):
-                    output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker)}"
-                    if os.path.exists(output_file_name):
-                        await self._upload_file(output_file_name)
-                        self.chunk_part += 1
-
-            # Write any remaining rows in the buffer
-            if self.current_buffer_size > 0:
-                await self.flush_daft_buffer(buffer, self.chunk_part)
-
-            # Record metrics for successful write
-            self.metrics.record_metric(
-                name="json_write_records",
-                value=dataframe.count_rows(),
-                metric_type=MetricType.COUNTER,
-                labels={"type": "daft"},
-                description="Number of records written to JSON files from daft DataFrame",
-            )
-        except Exception as e:
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="json_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "daft", "error": str(e)},
-                description="Number of errors while writing to JSON files",
-            )
-            logger.error(f"Error writing daft dataframe to json: {str(e)}")
-
-    async def flush_daft_buffer(self, buffer: List[str], chunk_part: int):
-        """Flush the current buffer to a JSON file.
-
-        This method combines all DataFrames in the buffer, writes them to a JSON file,
-        and uploads the file to the object store.
-        """
-        output_file_name = (
-            f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
-        )
-        with open(output_file_name, "ab+") as f:
-            f.writelines(buffer)
-        buffer.clear()  # Clear the buffer
-
-        self.current_buffer_size = 0
-
-        # Record chunk metrics
-        self.metrics.record_metric(
-            name="json_chunks_written",
-            value=1,
-            metric_type=MetricType.COUNTER,
-            labels={"type": "daft"},
-            description="Number of chunks written to JSON files",
-        )
-
-    async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
-        """Write a chunk to a JSON file.
-
-        This method writes a chunk to a JSON file and uploads the file to the object store.
-        """
-        mode = "w" if not os.path.exists(file_name) else "a"
-        chunk.to_json(file_name, orient="records", lines=True, mode=mode)
-
-    async def get_statistics(
-        self, typename: Optional[str] = None
-    ) -> ActivityStatistics:
-        """Get the statistics of the JSON files.
-
-        This method returns the statistics of the JSON files.
-        """
-        # Finally upload the final file
-        if self.current_buffer_size_bytes > 0:
-            output_file_name = (
-                f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
-            )
-            if os.path.exists(output_file_name):
-                await self._upload_file(output_file_name)
-                self.chunk_part += 1
-
-        # If chunk_start is set we don't want to increment the chunk_count
-        # Since it should only increment the chunk_part in this case
-        if self.chunk_start is None:
-            self.chunk_count += 1
-        self.partitions.append(self.chunk_part)
-
-        return await super().get_statistics(typename)
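
The hunk above is the full 1.1.0 implementation of the removed JsonOutput writer (item 43 in the file list). For reference, a minimal usage sketch of that removed API, based only on the constructor and methods visible in the deleted code; the output path, suffix, and typename values are placeholders, and the object-store upload triggered from get_statistics assumes the services used by the base Output class are available:

import asyncio

import daft

from application_sdk.outputs.json import JsonOutput  # 1.1.0 only; removed in 2.0.0


async def main() -> None:
    writer = JsonOutput(
        output_suffix="raw",            # placeholder object-store prefix
        output_path="/tmp/extraction",  # placeholder local base path
        typename="table",               # placeholder type identifier
        buffer_size=5000,               # rows buffered before a flush (1.1.0 default)
        chunk_size=50000,               # rows per chunk file (1.1.0 default)
    )

    # Rows are converted (datetimes -> epoch millis), stripped of null fields,
    # serialized with orjson, and appended to chunked .json files under the output path.
    df = daft.from_pydict({"name": ["orders", "customers"], "schema": ["public", "public"]})
    await writer.write_daft_dataframe(df)

    # Uploads the final partial chunk and returns an ActivityStatistics summary.
    stats = await writer.get_statistics(typename="table")
    print(stats)


asyncio.run(main())

In 2.0.0 the equivalent functionality lives somewhere under application_sdk/io (items 13-16), so this snippet only runs against 1.1.0.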