hotglue_singer_sdk-1.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. hotglue_singer_sdk/__init__.py +34 -0
  2. hotglue_singer_sdk/authenticators.py +554 -0
  3. hotglue_singer_sdk/cli/__init__.py +1 -0
  4. hotglue_singer_sdk/cli/common_options.py +37 -0
  5. hotglue_singer_sdk/configuration/__init__.py +1 -0
  6. hotglue_singer_sdk/configuration/_dict_config.py +101 -0
  7. hotglue_singer_sdk/exceptions.py +52 -0
  8. hotglue_singer_sdk/helpers/__init__.py +1 -0
  9. hotglue_singer_sdk/helpers/_catalog.py +122 -0
  10. hotglue_singer_sdk/helpers/_classproperty.py +18 -0
  11. hotglue_singer_sdk/helpers/_compat.py +15 -0
  12. hotglue_singer_sdk/helpers/_flattening.py +374 -0
  13. hotglue_singer_sdk/helpers/_schema.py +100 -0
  14. hotglue_singer_sdk/helpers/_secrets.py +41 -0
  15. hotglue_singer_sdk/helpers/_simpleeval.py +678 -0
  16. hotglue_singer_sdk/helpers/_singer.py +280 -0
  17. hotglue_singer_sdk/helpers/_state.py +282 -0
  18. hotglue_singer_sdk/helpers/_typing.py +231 -0
  19. hotglue_singer_sdk/helpers/_util.py +27 -0
  20. hotglue_singer_sdk/helpers/capabilities.py +240 -0
  21. hotglue_singer_sdk/helpers/jsonpath.py +39 -0
  22. hotglue_singer_sdk/io_base.py +134 -0
  23. hotglue_singer_sdk/mapper.py +691 -0
  24. hotglue_singer_sdk/mapper_base.py +156 -0
  25. hotglue_singer_sdk/plugin_base.py +415 -0
  26. hotglue_singer_sdk/py.typed +0 -0
  27. hotglue_singer_sdk/sinks/__init__.py +14 -0
  28. hotglue_singer_sdk/sinks/batch.py +90 -0
  29. hotglue_singer_sdk/sinks/core.py +412 -0
  30. hotglue_singer_sdk/sinks/record.py +66 -0
  31. hotglue_singer_sdk/sinks/sql.py +299 -0
  32. hotglue_singer_sdk/streams/__init__.py +14 -0
  33. hotglue_singer_sdk/streams/core.py +1294 -0
  34. hotglue_singer_sdk/streams/graphql.py +74 -0
  35. hotglue_singer_sdk/streams/rest.py +611 -0
  36. hotglue_singer_sdk/streams/sql.py +1023 -0
  37. hotglue_singer_sdk/tap_base.py +580 -0
  38. hotglue_singer_sdk/target_base.py +554 -0
  39. hotglue_singer_sdk/target_sdk/__init__.py +0 -0
  40. hotglue_singer_sdk/target_sdk/auth.py +124 -0
  41. hotglue_singer_sdk/target_sdk/client.py +286 -0
  42. hotglue_singer_sdk/target_sdk/common.py +13 -0
  43. hotglue_singer_sdk/target_sdk/lambda.py +121 -0
  44. hotglue_singer_sdk/target_sdk/rest.py +108 -0
  45. hotglue_singer_sdk/target_sdk/sinks.py +16 -0
  46. hotglue_singer_sdk/target_sdk/target.py +570 -0
  47. hotglue_singer_sdk/target_sdk/target_base.py +627 -0
  48. hotglue_singer_sdk/testing.py +198 -0
  49. hotglue_singer_sdk/typing.py +603 -0
  50. hotglue_singer_sdk-1.0.2.dist-info/METADATA +53 -0
  51. hotglue_singer_sdk-1.0.2.dist-info/RECORD +53 -0
  52. hotglue_singer_sdk-1.0.2.dist-info/WHEEL +4 -0
  53. hotglue_singer_sdk-1.0.2.dist-info/licenses/LICENSE +201 -0
hotglue_singer_sdk/sinks/core.py
@@ -0,0 +1,412 @@
+ """Sink classes load data to a target."""
+
+ import abc
+ import datetime
+ import time
+ from logging import Logger
+ from types import MappingProxyType
+ from typing import Any, Dict, List, Mapping, Optional, Union
+
+ from dateutil import parser
+ from jsonschema import Draft4Validator, FormatChecker
+
+ from hotglue_singer_sdk.helpers._compat import final
+ from hotglue_singer_sdk.helpers._typing import (
+     DatetimeErrorTreatmentEnum,
+     get_datelike_property_type,
+     handle_invalid_timestamp_in_record,
+ )
+ from hotglue_singer_sdk.plugin_base import PluginBase
+
+ JSONSchemaValidator = Draft4Validator
+
+
+ class Sink(metaclass=abc.ABCMeta):
+     """Abstract base class for target sinks."""
+
+     # max timestamp/datetime supported, used to reset invalid dates
+
+     logger: Logger
+
+     MAX_SIZE_DEFAULT = 10000
+
+     def __init__(
+         self,
+         target: PluginBase,
+         stream_name: str,
+         schema: Dict,
+         key_properties: Optional[List[str]],
+     ) -> None:
+         """Initialize target sink.
+
+         Args:
+             target: Target instance.
+             stream_name: Name of the stream to sink.
+             schema: Schema of the stream to sink.
+             key_properties: Primary key of the stream to sink.
+         """
+         self.logger = target.logger
+         self._config = dict(target.config)
+         self._pending_batch: Optional[dict] = None
+         self.stream_name = stream_name
+         self.logger.info(f"Initializing target sink for stream '{stream_name}'...")
+         self.schema = schema
+         if self.include_sdc_metadata_properties:
+             self._add_sdc_metadata_to_schema()
+         else:
+             self._remove_sdc_metadata_from_schema()
+         self.records_to_drain: Union[List[dict], Any] = []
+         self._context_draining: Optional[dict] = None
+         self.latest_state: Optional[dict] = None
+         self._draining_state: Optional[dict] = None
+         self.drained_state: Optional[dict] = None
+         self.key_properties = key_properties or []
+
+         # Tally counters
+         self._total_records_written: int = 0
+         self._total_dupe_records_merged: int = 0
+         self._total_records_read: int = 0
+         self._batch_records_read: int = 0
+         self._batch_dupe_records_merged: int = 0
+
+         self._validator = Draft4Validator(schema, format_checker=FormatChecker())
+
+     def _get_context(self, record: dict) -> dict:
+         """Return an empty dictionary by default.
+
+         NOTE: Future versions of the SDK may expand the available context attributes.
+
+         Args:
+             record: Individual record in the stream.
+
+         Returns:
+             An empty dictionary, used as the default batch context.
+         """
+         return {}
+
+     # Size properties
+
+     @property
+     def max_size(self) -> int:
+         """Get max batch size.
+
+         Returns:
+             Max number of records to batch before `is_full=True`.
+         """
+         return self.MAX_SIZE_DEFAULT
+
+     @property
+     def current_size(self) -> int:
+         """Get current batch size.
+
+         Returns:
+             The number of records to drain.
+         """
+         return self._batch_records_read
+
+     @property
+     def is_full(self) -> bool:
+         """Check against the size limit.
+
+         Returns:
+             True if the sink needs to be drained.
+         """
+         return self.current_size >= self.max_size
+
+     # Tally methods
+
+     @final
+     def tally_record_read(self, count: int = 1) -> None:
+         """Increment the records read tally.
+
+         This method is called automatically by the SDK when records are read.
+
+         Args:
+             count: Number to increase record count by.
+         """
+         self._total_records_read += count
+         self._batch_records_read += count
+
+     @final
+     def tally_record_written(self, count: int = 1) -> None:
+         """Increment the records written tally.
+
+         This method is called automatically by the SDK after
+         :meth:`~hotglue_singer_sdk.Sink.process_record()`
+         or :meth:`~hotglue_singer_sdk.Sink.process_batch()`.
+
+         Args:
+             count: Number to increase record count by.
+         """
+         self._total_records_written += count
+
+     @final
+     def tally_duplicate_merged(self, count: int = 1) -> None:
+         """Increment the records merged tally.
+
+         This method should be called directly by the Target implementation.
+
+         Args:
+             count: Number to increase record count by.
+         """
+         self._total_dupe_records_merged += count
+         self._batch_dupe_records_merged += count
+
+     # Properties
+
+     @property
+     def config(self) -> Mapping[str, Any]:
+         """Get plugin configuration.
+
+         Returns:
+             A frozen (read-only) config dictionary map.
+         """
+         return MappingProxyType(self._config)
+
+     @property
+     def include_sdc_metadata_properties(self) -> bool:
+         """Check if metadata columns should be added.
+
+         Returns:
+             True if metadata columns should be added.
+         """
+         return self.config.get("add_record_metadata", False)
+
+     @property
+     def datetime_error_treatment(self) -> DatetimeErrorTreatmentEnum:
+         """Return a treatment to use for datetime parse errors: ERROR, MAX, or NULL.
+
+         Returns:
+             The datetime error treatment; `ERROR` by default.
+         """
+         return DatetimeErrorTreatmentEnum.ERROR
+
+     # Record processing
+
+     def _add_sdc_metadata_to_record(
+         self, record: dict, message: dict, context: dict
+     ) -> None:
+         """Populate metadata _sdc columns from incoming record message.
+
+         Record metadata specs documented at:
+         https://sdk.meltano.com/en/latest/implementation/record_metadata.md
+
+         Args:
+             record: Individual record in the stream.
+             message: The record message from which the record was parsed.
+             context: Stream partition or context dictionary.
+         """
+         record["_sdc_extracted_at"] = message.get("time_extracted")
+         record["_sdc_received_at"] = datetime.datetime.now().isoformat()
+         record["_sdc_batched_at"] = (
+             context.get("batch_start_time", None) or datetime.datetime.now()
+         ).isoformat()
+         record["_sdc_deleted_at"] = record.get("_sdc_deleted_at")
+         record["_sdc_sequence"] = int(round(time.time() * 1000))
+         record["_sdc_table_version"] = message.get("version")
+
+     def _add_sdc_metadata_to_schema(self) -> None:
+         """Add _sdc metadata columns.
+
+         Record metadata specs documented at:
+         https://sdk.meltano.com/en/latest/implementation/record_metadata.md
+         """
+         properties_dict = self.schema["properties"]
+         for col in {
+             "_sdc_extracted_at",
+             "_sdc_received_at",
+             "_sdc_batched_at",
+             "_sdc_deleted_at",
+         }:
+             properties_dict[col] = {
+                 "type": ["null", "string"],
+                 "format": "date-time",
+             }
+         for col in {"_sdc_sequence", "_sdc_table_version"}:
+             properties_dict[col] = {"type": ["null", "integer"]}
+
+     def _remove_sdc_metadata_from_schema(self) -> None:
+         """Remove _sdc metadata columns.
+
+         Record metadata specs documented at:
+         https://sdk.meltano.com/en/latest/implementation/record_metadata.md
+         """
+         properties_dict = self.schema["properties"]
+         for col in {
+             "_sdc_extracted_at",
+             "_sdc_received_at",
+             "_sdc_batched_at",
+             "_sdc_deleted_at",
+             "_sdc_sequence",
+             "_sdc_table_version",
+         }:
+             properties_dict.pop(col, None)
+
+     def _remove_sdc_metadata_from_record(self, record: dict) -> None:
+         """Remove metadata _sdc columns from incoming record message.
+
+         Record metadata specs documented at:
+         https://sdk.meltano.com/en/latest/implementation/record_metadata.md
+
+         Args:
+             record: Individual record in the stream.
+         """
+         record.pop("_sdc_extracted_at", None)
+         record.pop("_sdc_received_at", None)
+         record.pop("_sdc_batched_at", None)
+         record.pop("_sdc_deleted_at", None)
+         record.pop("_sdc_sequence", None)
+         record.pop("_sdc_table_version", None)
+
+     # Record validation
+
+     def _validate_and_parse(self, record: Dict) -> Dict:
+         """Validate or repair the record, parsing to python-native types as needed.
+
+         Args:
+             record: Individual record in the stream.
+
+         Returns:
+             The validated record, with datelike values parsed per the configured
+             datetime error treatment.
+         """
+         self._validator.validate(record)
+         self._parse_timestamps_in_record(
+             record=record, schema=self.schema, treatment=self.datetime_error_treatment
+         )
+         return record
+
+     def _parse_timestamps_in_record(
+         self, record: Dict, schema: Dict, treatment: DatetimeErrorTreatmentEnum
+     ) -> None:
+         """Parse strings to datetime.datetime values, repairing or erroring on failure.
+
+         Attempts to parse every field that is of type date/datetime/time. If its value
+         is out of range, repair logic will be driven by the `treatment` input arg:
+         MAX, NULL, or ERROR.
+
+         Args:
+             record: Individual record in the stream.
+             schema: JSON schema for the stream.
+             treatment: Treatment to apply to invalid timestamps.
+         """
+         for key in record.keys():
+             datelike_type = get_datelike_property_type(schema["properties"][key])
+             if datelike_type:
+                 try:
+                     date_val = record[key]
+                     if record[key] is not None:
+                         date_val = parser.parse(date_val)
+                 except Exception as ex:
+                     date_val = handle_invalid_timestamp_in_record(
+                         record,
+                         [key],
+                         date_val,
+                         datelike_type,
+                         ex,
+                         treatment,
+                         self.logger,
+                     )
+                 record[key] = date_val
+
+     def _after_process_record(self, context: dict) -> None:
+         """Perform post-processing and record keeping. Internal hook.
+
+         Args:
+             context: Stream partition or context dictionary.
+         """
+         pass
+
+     # SDK developer overrides:
+
+     def preprocess_record(self, record: Dict, context: dict) -> dict:
+         """Process incoming record and return a modified result.
+
+         Args:
+             record: Individual record in the stream.
+             context: Stream partition or context dictionary.
+
+         Returns:
+             A new, processed record.
+         """
+         return record
+
+     @abc.abstractmethod
+     def process_record(self, record: dict, context: dict) -> None:
+         """Load the latest record from the stream.
+
+         Implementations may either load to the `context` dict for staging (the
+         default behavior for Batch types), or permanently write out to the target.
+
+         Anything appended to :attr:`hotglue_singer_sdk.Sink.records_to_drain` will be
+         automatically passed to
+         :meth:`~hotglue_singer_sdk.Sink.process_batch()` to be permanently written
+         during the process_batch operation.
+
+         If duplicates are merged, these can be tracked via
+         :meth:`~hotglue_singer_sdk.Sink.tally_duplicate_merged()`.
+
+         Args:
+             record: Individual record in the stream.
+             context: Stream partition or context dictionary.
+         """
+         pass
+
+     def start_drain(self) -> dict:
+         """Set and return `self._context_draining`.
+
+         Returns:
+             The context dictionary of the batch being drained.
+         """
+         self._context_draining = self._pending_batch or {}
+         self._pending_batch = None
+         return self._context_draining
+
+     @abc.abstractmethod
+     def process_batch(self, context: dict) -> None:
+         """Process all records per the batch's `context` dictionary.
+
+         If duplicates are merged, these can optionally be tracked via
+         `tally_duplicate_merged()`.
+
+         Args:
+             context: Stream partition or context dictionary.
+
+         Raises:
+             NotImplementedError: If derived class does not override this method.
+         """
+         raise NotImplementedError("No handling exists for process_batch().")
+
+     def mark_drained(self) -> None:
+         """Reset `records_to_drain` and any other tracking."""
+         self.drained_state = self._draining_state
+         self._draining_state = None
+         self._context_draining = None
+         if self._batch_records_read:
+             self.tally_record_written(
+                 self._batch_records_read - self._batch_dupe_records_merged
+             )
+         self._batch_records_read = 0
+
+     def activate_version(self, new_version: int) -> None:
+         """Bump the active version of the target table.
+
+         This method should be overridden by developers if a custom implementation is
+         expected.
+
+         Args:
+             new_version: The version number to activate.
+         """
+         _ = new_version
+         self.logger.warning(
+             "ACTIVATE_VERSION message received but not implemented by this target. "
+             "Ignoring."
+         )
+
+     def clean_up(self) -> None:
+         """Perform any clean up actions required at end of a stream.
+
+         Implementations should ensure that clean up does not affect resources
+         that may be in use from other instances of the same sink. Stream name alone
+         should not be relied on; it's recommended to use a UUID as well.
+         """
+     pass
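
The division of labor above is that `process_record()` stages each record into the batch `context` while `process_batch()` permanently writes the staged records when the sink drains. A minimal sketch of a concrete batch sink on top of this base class follows; the `JsonLinesSink` name and the output path are hypothetical illustrations, not part of this package.

    import json

    from hotglue_singer_sdk.sinks.core import Sink


    class JsonLinesSink(Sink):
        """Hypothetical sink that drains each batch to a JSON-lines file."""

        MAX_SIZE_DEFAULT = 500  # drain after 500 records instead of the default 10000

        def process_record(self, record: dict, context: dict) -> None:
            # Stage the record; the same context dict is handed to
            # process_batch() when the target drains the sink.
            context.setdefault("records", []).append(record)

        def process_batch(self, context: dict) -> None:
            # Permanently write everything staged by process_record().
            path = f"{self.stream_name}.jsonl"  # hypothetical output location
            with open(path, "a", encoding="utf-8") as out:
                for record in context.get("records", []):
                    out.write(json.dumps(record) + "\n")

Draining is driven by the target rather than the sink: once `is_full` is true, the target calls `start_drain()`, passes the returned context to `process_batch()`, and finishes with `mark_drained()`, which (per its implementation above) also tallies the records written as records read minus duplicates merged.
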
hotglue_singer_sdk/sinks/record.py
@@ -0,0 +1,66 @@
+ """Sink classes load data to a target."""
+
+ import abc
+
+ from hotglue_singer_sdk.helpers._compat import final
+ from hotglue_singer_sdk.sinks.core import Sink
+
+
+ class RecordSink(Sink):
+     """Base class for singleton record writers."""
+
+     current_size = 0  # Records are always written directly
+
+     def _after_process_record(self, context: dict) -> None:
+         """Perform post-processing and record keeping. Internal hook.
+
+         The RecordSink class uses this method to tally each record written.
+
+         Args:
+             context: Stream partition or context dictionary.
+         """
+         self.tally_record_written()
+
+     @final
+     def process_batch(self, context: dict) -> None:
+         """Do nothing and return immediately.
+
+         The RecordSink class does not support batching.
+
+         This method may not be overridden.
+
+         Args:
+             context: Stream partition or context dictionary.
+         """
+         pass
+
+     @final
+     def start_batch(self, context: dict) -> None:
+         """Do nothing and return immediately.
+
+         The RecordSink class does not support batching.
+
+         This method may not be overridden.
+
+         Args:
+             context: Stream partition or context dictionary.
+         """
+         pass
+
+     @abc.abstractmethod
+     def process_record(self, record: dict, context: dict) -> None:
+         """Load the latest record from the stream.
+
+         This method must be overridden.
+
+         Implementations should permanently serialize each record to the target
+         prior to returning.
+
+         If duplicates are merged/skipped instead of being loaded, merges can be
+         tracked via :meth:`~hotglue_singer_sdk.Sink.tally_duplicate_merged()`.
+
+         Args:
+             record: Individual record in the stream.
+             context: Stream partition or context dictionary.
+         """
+         pass
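
In contrast to the batch-oriented `Sink`, a `RecordSink` subclass implements only `process_record()` and must write each record before returning; the base class's `_after_process_record()` hook then tallies the write. A minimal sketch under the same hypothetical file-output assumption as the earlier example:

    import json

    from hotglue_singer_sdk.sinks.record import RecordSink


    class JsonLinesRecordSink(RecordSink):
        """Hypothetical sink that appends each record to a JSON-lines file."""

        def process_record(self, record: dict, context: dict) -> None:
            # Serialize immediately; there is no batching in RecordSink.
            path = f"{self.stream_name}.jsonl"  # hypothetical output location
            with open(path, "a", encoding="utf-8") as out:
                out.write(json.dumps(record) + "\n")
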