hotglue-singer-sdk 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hotglue_singer_sdk/__init__.py +34 -0
- hotglue_singer_sdk/authenticators.py +554 -0
- hotglue_singer_sdk/cli/__init__.py +1 -0
- hotglue_singer_sdk/cli/common_options.py +37 -0
- hotglue_singer_sdk/configuration/__init__.py +1 -0
- hotglue_singer_sdk/configuration/_dict_config.py +101 -0
- hotglue_singer_sdk/exceptions.py +52 -0
- hotglue_singer_sdk/helpers/__init__.py +1 -0
- hotglue_singer_sdk/helpers/_catalog.py +122 -0
- hotglue_singer_sdk/helpers/_classproperty.py +18 -0
- hotglue_singer_sdk/helpers/_compat.py +15 -0
- hotglue_singer_sdk/helpers/_flattening.py +374 -0
- hotglue_singer_sdk/helpers/_schema.py +100 -0
- hotglue_singer_sdk/helpers/_secrets.py +41 -0
- hotglue_singer_sdk/helpers/_simpleeval.py +678 -0
- hotglue_singer_sdk/helpers/_singer.py +280 -0
- hotglue_singer_sdk/helpers/_state.py +282 -0
- hotglue_singer_sdk/helpers/_typing.py +231 -0
- hotglue_singer_sdk/helpers/_util.py +27 -0
- hotglue_singer_sdk/helpers/capabilities.py +240 -0
- hotglue_singer_sdk/helpers/jsonpath.py +39 -0
- hotglue_singer_sdk/io_base.py +134 -0
- hotglue_singer_sdk/mapper.py +691 -0
- hotglue_singer_sdk/mapper_base.py +156 -0
- hotglue_singer_sdk/plugin_base.py +415 -0
- hotglue_singer_sdk/py.typed +0 -0
- hotglue_singer_sdk/sinks/__init__.py +14 -0
- hotglue_singer_sdk/sinks/batch.py +90 -0
- hotglue_singer_sdk/sinks/core.py +412 -0
- hotglue_singer_sdk/sinks/record.py +66 -0
- hotglue_singer_sdk/sinks/sql.py +299 -0
- hotglue_singer_sdk/streams/__init__.py +14 -0
- hotglue_singer_sdk/streams/core.py +1294 -0
- hotglue_singer_sdk/streams/graphql.py +74 -0
- hotglue_singer_sdk/streams/rest.py +611 -0
- hotglue_singer_sdk/streams/sql.py +1023 -0
- hotglue_singer_sdk/tap_base.py +580 -0
- hotglue_singer_sdk/target_base.py +554 -0
- hotglue_singer_sdk/target_sdk/__init__.py +0 -0
- hotglue_singer_sdk/target_sdk/auth.py +124 -0
- hotglue_singer_sdk/target_sdk/client.py +286 -0
- hotglue_singer_sdk/target_sdk/common.py +13 -0
- hotglue_singer_sdk/target_sdk/lambda.py +121 -0
- hotglue_singer_sdk/target_sdk/rest.py +108 -0
- hotglue_singer_sdk/target_sdk/sinks.py +16 -0
- hotglue_singer_sdk/target_sdk/target.py +570 -0
- hotglue_singer_sdk/target_sdk/target_base.py +627 -0
- hotglue_singer_sdk/testing.py +198 -0
- hotglue_singer_sdk/typing.py +603 -0
- hotglue_singer_sdk-1.0.2.dist-info/METADATA +53 -0
- hotglue_singer_sdk-1.0.2.dist-info/RECORD +53 -0
- hotglue_singer_sdk-1.0.2.dist-info/WHEEL +4 -0
- hotglue_singer_sdk-1.0.2.dist-info/licenses/LICENSE +201 -0

hotglue_singer_sdk/streams/core.py
@@ -0,0 +1,1294 @@

"""Stream abstract class."""

import abc
import concurrent.futures
import copy
import datetime
import json
import logging
from os import PathLike
from pathlib import Path
from types import MappingProxyType
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Type,
    TypeVar,
    Union,
    cast,
)

import pendulum
import requests
import singer
from singer import RecordMessage, Schema, SchemaMessage, StateMessage

from hotglue_singer_sdk.exceptions import InvalidStreamSortException, MaxRecordsLimitException
from hotglue_singer_sdk.helpers._catalog import pop_deselected_record_properties
from hotglue_singer_sdk.helpers._compat import final
from hotglue_singer_sdk.helpers._flattening import get_flattening_options
from hotglue_singer_sdk.helpers._schema import SchemaPlus
from hotglue_singer_sdk.helpers._singer import (
    Catalog,
    CatalogEntry,
    MetadataMapping,
    SelectionMask,
)
from hotglue_singer_sdk.helpers._state import (
    finalize_state_progress_markers,
    get_starting_replication_value,
    get_state_partitions_list,
    get_writeable_state_dict,
    increment_state,
    log_sort_error,
    reset_state_progress_markers,
    write_replication_key_signpost,
    write_starting_replication_value,
)
from hotglue_singer_sdk.helpers._typing import conform_record_data_types, is_datetime_type
from hotglue_singer_sdk.helpers._util import utc_now
from hotglue_singer_sdk.mapper import RemoveRecordTransform, SameRecordTransform, StreamMap
from hotglue_singer_sdk.plugin_base import PluginBase as TapBaseClass

# Replication methods
REPLICATION_FULL_TABLE = "FULL_TABLE"
REPLICATION_INCREMENTAL = "INCREMENTAL"
REPLICATION_LOG_BASED = "LOG_BASED"

FactoryType = TypeVar("FactoryType", bound="Stream")

METRICS_LOG_LEVEL_SETTING = "metrics_log_level"


class Stream(metaclass=abc.ABCMeta):
    """Abstract base class for tap streams."""

    STATE_MSG_FREQUENCY = 10000  # Number of records between state messages
    _MAX_RECORDS_LIMIT: Optional[int] = None

    # Used for nested stream relationships
    parent_stream_type: Optional[Type["Stream"]] = None
    ignore_parent_replication_key: bool = False
    parallelization_limit: int = 1

    # Internal API cost aggregator
    _sync_costs: Dict[str, int] = {}

    def __init__(
        self,
        tap: TapBaseClass,
        schema: Optional[Union[str, PathLike, Dict[str, Any], Schema]] = None,
        name: Optional[str] = None,
    ) -> None:
        """Init tap stream.

        Args:
            tap: Singer Tap this stream belongs to.
            schema: JSON schema for records in this stream.
            name: Name of this stream.

        Raises:
            ValueError: If the stream name or schema cannot be determined.
            FileNotFoundError: If a schema filepath is given but does not exist.
        """
        if name:
            self.name: str = name
        if not self.name:
            raise ValueError("Missing argument or class variable 'name'.")

        self.logger: logging.Logger = tap.logger
        self.tap_name: str = tap.name
        self._config: dict = dict(tap.config)
        self._tap = tap
        self._tap_state = tap.state
        self._tap_input_catalog: Optional[Catalog] = None
        self._stream_maps: Optional[List[StreamMap]] = None
        self.forced_replication_method: Optional[str] = None
        self._replication_key: Optional[str] = None
        self._primary_keys: Optional[List[str]] = None
        self._state_partitioning_keys: Optional[List[str]] = None
        self._schema_filepath: Optional[Path] = None
        self._metadata: Optional[MetadataMapping] = None
        self._mask: Optional[SelectionMask] = None
        self._schema: dict
        self.child_streams: List[Stream] = []
        self._minimum_start_time: Optional[datetime.datetime] = None

        if schema:
            if isinstance(schema, (PathLike, str)):
                if not Path(schema).is_file():
                    raise FileNotFoundError(
                        f"Could not find schema file '{schema}'."
                    )

                self._schema_filepath = Path(schema)
            elif isinstance(schema, dict):
                self._schema = schema
            elif isinstance(schema, Schema):
                self._schema = schema.to_dict()
            else:
                raise ValueError(
                    f"Unexpected type {type(schema).__name__} for arg 'schema'."
                )

        if self.schema_filepath:
            self._schema = json.loads(Path(self.schema_filepath).read_text())

        if not self.schema:
            raise ValueError(
                f"Could not initialize schema for stream '{self.name}'. "
                "A valid schema object or filepath was not provided."
            )

    @property
    def stream_maps(self) -> List[StreamMap]:
        """Get stream transformation maps.

        The 0th item is the primary stream map. List should not be empty.

        Returns:
            A list of one or more map transformations for this stream.
        """
        if self._stream_maps:
            return self._stream_maps

        if self._tap.mapper:
            self._stream_maps = self._tap.mapper.stream_maps[self.name]
            self.logger.info(
                f"Tap has custom mapper. Using {len(self.stream_maps)} provided map(s)."
            )
        else:
            self.logger.info(
                f"No custom mapper provided for '{self.name}'. "
                "Using SameRecordTransform."
            )
            self._stream_maps = [
                SameRecordTransform(
                    stream_alias=self.name,
                    raw_schema=self.schema,
                    key_properties=self.primary_keys,
                    flattening_options=get_flattening_options(self.config),
                )
            ]
        return self._stream_maps

    @property
    def is_timestamp_replication_key(self) -> bool:
        """Check if the replication key is a timestamp.

        Developers can override to `True` in order to force this value, although this
        should not be required in most use cases since the type can generally be
        accurately detected from the JSON Schema.

        Returns:
            True if the stream uses a timestamp-based replication key.
        """
        if not self.replication_key:
            return False
        type_dict = self.schema.get("properties", {}).get(self.replication_key)
        return is_datetime_type(type_dict)

    def get_starting_replication_key_value(
        self, context: Optional[dict]
    ) -> Optional[Any]:
        """Get starting replication key.

        Will return the value of the stream's replication key when `--state` is
        passed. If no prior state exists, will return `None`.

        Developers should use this method to seed incremental processing for
        non-datetime replication keys. For datetime and date replication keys, use
        :meth:`~hotglue_singer_sdk.Stream.get_starting_timestamp()`

        Args:
            context: Stream partition or context dictionary.

        Returns:
            Starting replication value.
        """
        state = self.get_context_state(context)

        return get_starting_replication_value(state)
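
    # Illustrative sketch (editor's note, not part of the original file): a
    # subclass with a non-datetime replication key, such as an integer
    # sequence number, might seed its query like this; `my_client.fetch()`
    # and its `min_sequence` parameter are hypothetical:
    #
    #     def get_records(self, context):
    #         cursor = self.get_starting_replication_key_value(context) or 0
    #         for row in my_client.fetch(min_sequence=cursor):
    #             yield row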

    def get_starting_timestamp(
        self, context: Optional[dict]
    ) -> Optional[datetime.datetime]:
        """Get starting replication timestamp.

        Will return the value of the stream's replication key when `--state` is
        passed. If no state exists, will return `start_date` if set, or `None` if
        neither the stream state nor `start_date` is set.

        Developers should use this method to seed incremental processing for date
        and datetime replication keys. For non-datetime replication keys, use
        :meth:`~hotglue_singer_sdk.Stream.get_starting_replication_key_value()`

        Args:
            context: Stream partition or context dictionary.

        Returns:
            `start_date` from config, or state value if using timestamp replication.

        Raises:
            ValueError: If the replication value is not a valid timestamp.
        """
        value = self.get_starting_replication_key_value(context)

        if value is None:
            return None

        if not self.is_timestamp_replication_key:
            raise ValueError(
                f"The replication key {self.replication_key} is not of timestamp type"
            )

        return cast(datetime.datetime, pendulum.parse(value))

    def get_starting_time(self, context, is_inclusive=False):
        """Get the starting time for the sync, honoring `minimum_start_time`.

        Returns the bookmarked replication timestamp if one exists, otherwise the
        configured `start_date`. If `is_inclusive` is set, one second is added to
        the bookmark so the last previously synced record is not re-fetched.
        """
        start_date = self.config.get("start_date")
        if start_date:
            start_date = pendulum.parse(start_date)
        rep_key = self.get_starting_timestamp(context)

        if is_inclusive and rep_key:
            rep_key = rep_key + datetime.timedelta(seconds=1)

        # If the stream has a minimum start time and the bookmark (or start
        # date) is earlier than that minimum, return the minimum start time.
        start = rep_key or start_date
        if self.minimum_start_time and start and start < self.minimum_start_time:
            return self.minimum_start_time
        return start
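
    # Illustrative sketch (editor's note, not part of the original file): a
    # REST stream subclass might use the starting time to filter its API
    # requests; `updated_since` is a hypothetical query parameter:
    #
    #     def get_url_params(self, context, next_page_token):
    #         params = {}
    #         start = self.get_starting_time(context)
    #         if start:
    #             params["updated_since"] = start.isoformat()
    #         return params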

    @final
    @property
    def selected(self) -> bool:
        """Check if stream is selected.

        Returns:
            True if the stream is selected.
        """
        return self.mask.get((), True)

    @final
    @property
    def has_selected_descendents(self) -> bool:
        """Check descendents.

        Returns:
            True if any child streams are selected, recursively.
        """
        for child in self.child_streams or []:
            if child.selected or child.has_selected_descendents:
                return True

        return False

    @final
    @property
    def descendent_streams(self) -> List["Stream"]:
        """Get child streams.

        Returns:
            A list of all children, recursively.
        """
        result: List[Stream] = list(self.child_streams) or []
        for child in self.child_streams:
            result += child.descendent_streams or []
        return result

    def _write_replication_key_signpost(
        self,
        context: Optional[dict],
        value: Union[datetime.datetime, str, int, float],
    ) -> None:
        """Write the signpost value, if available.

        Args:
            context: Stream partition or context dictionary.
            value: The signpost (maximum allowable bookmark) value to write.
        """
        if not value:
            return

        state = self.get_context_state(context)
        write_replication_key_signpost(state, value)

    def compare_start_date(self, value: str, start_date_value: str) -> str:
        """Compare a bookmark value to a start date and return the most recent value.

        If the replication key is a datetime-formatted string, this method will parse
        the value and compare it to the start date. Otherwise, the bookmark value is
        returned.

        If the tap uses a non-datetime replication key (e.g. a UNIX timestamp), the
        developer is encouraged to override this method to provide custom logic for
        comparing the bookmark value to the start date.

        Args:
            value: The replication key value.
            start_date_value: The start date value from the config.

        Returns:
            The most recent value between the bookmark and start date.
        """
        if self.is_timestamp_replication_key:
            return max(value, start_date_value, key=pendulum.parse)
        else:
            return value
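
    # Illustrative sketch (editor's note, not part of the original file): for
    # a UNIX-timestamp replication key, an override might compare numerically;
    # this assumes the configured `start_date` is also an epoch value:
    #
    #     def compare_start_date(self, value: str, start_date_value: str) -> str:
    #         return str(max(int(value), int(start_date_value)))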

    def _write_starting_replication_value(self, context: Optional[dict]) -> None:
        """Write the starting replication value, if available.

        Args:
            context: Stream partition or context dictionary.
        """
        value = None
        state = self.get_context_state(context)

        if self.replication_key:
            replication_key_value = state.get("replication_key_value")
            if replication_key_value and self.replication_key == state.get(
                "replication_key"
            ):
                value = replication_key_value

            # Use start_date if it is more recent than the replication_key state
            start_date_value: Optional[str] = self.config.get("start_date")
            if start_date_value:
                if not value:
                    value = start_date_value
                else:
                    value = self.compare_start_date(value, start_date_value)

        write_starting_replication_value(state, value)

    def get_replication_key_signpost(
        self, context: Optional[dict]
    ) -> Optional[Union[datetime.datetime, Any]]:
        """Get the replication signpost.

        For timestamp-based replication keys, this defaults to `utc_now()`. For
        non-timestamp replication keys, default to `None`. For consistency in
        subsequent calls, the value will be frozen (cached) at its initially called
        state, per partition argument if applicable.

        Developers may optionally override this method in advanced use cases such
        as unsorted incremental streams or complex hierarchical stream scenarios.
        For more info: :doc:`/implementation/state`

        Args:
            context: Stream partition or context dictionary.

        Returns:
            Max allowable bookmark value for this stream's replication key.
        """
        if self.is_timestamp_replication_key:
            return utc_now()

        return None

    @property
    def schema_filepath(self) -> Optional[Path]:
        """Get path to schema file.

        Returns:
            Path to a schema file for the stream or `None` if n/a.
        """
        return self._schema_filepath

    @property
    def schema(self) -> dict:
        """Get schema.

        Returns:
            JSON Schema dictionary for this stream.
        """
        return self._schema

    @property
    def primary_keys(self) -> Optional[List[str]]:
        """Get primary keys.

        Returns:
            A list of primary key(s) for the stream.
        """
        if not self._primary_keys:
            return []
        return self._primary_keys

    @primary_keys.setter
    def primary_keys(self, new_value: List[str]) -> None:
        """Set primary key(s) for the stream.

        Args:
            new_value: The new list of primary key names.
        """
        self._primary_keys = new_value

    @property
    def state_partitioning_keys(self) -> Optional[List[str]]:
        """Get state partition keys.

        If not set, a default partitioning will be inherited from the stream's
        context. If an empty list is set (`[]`), state will be held in one bookmark
        per stream.

        Returns:
            Partition keys for the stream state bookmarks.
        """
        return self._state_partitioning_keys

    @state_partitioning_keys.setter
    def state_partitioning_keys(self, new_value: Optional[List[str]]) -> None:
        """Set partition keys for the stream state bookmarks.

        If not set, a default partitioning will be inherited from the stream's
        context. If an empty list is set (`[]`), state will be held in one bookmark
        per stream.

        Args:
            new_value: The new list of keys.
        """
        self._state_partitioning_keys = new_value

    @property
    def replication_key(self) -> Optional[str]:
        """Get replication key.

        Returns:
            Replication key for the stream.
        """
        if not self._replication_key:
            return None
        return self._replication_key

    @replication_key.setter
    def replication_key(self, new_value: str) -> None:
        """Set replication key for the stream.

        Args:
            new_value: The new replication key name.
        """
        self._replication_key = new_value

    @property
    def minimum_start_time(self) -> Optional[datetime.datetime]:
        """Get the earliest timestamp this stream is allowed to start from."""
        return self._minimum_start_time if hasattr(self, "_minimum_start_time") else None

    @minimum_start_time.setter
    def minimum_start_time(self, value: Optional[datetime.datetime]):
        """Set the earliest timestamp this stream is allowed to start from."""
        self._minimum_start_time = value

    @property
    def is_sorted(self) -> bool:
        """Expect stream to be sorted.

        When `True`, incremental streams will attempt to resume if unexpectedly
        interrupted.

        Returns:
            `True` if stream is sorted. Defaults to `False`.
        """
        return False

    @property
    def check_sorted(self) -> bool:
        """Check if stream is sorted.

        This setting enables additional checks which may trigger
        `InvalidStreamSortException` if records are found which are unsorted.

        Returns:
            `True` if sorting is checked. Defaults to `True`.
        """
        return True
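
    # Illustrative sketch (editor's note, not part of the original file): a
    # stream whose source returns records ordered by the replication key can
    # declare itself sorted to enable resumable incremental syncs:
    #
    #     @property
    #     def is_sorted(self) -> bool:
    #         return True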

    def get_paging_windows(self, context: Optional[dict]) -> List[Dict[str, Any]]:
        """Generate a list of paging windows to be added to each context.

        Returns:
            A list of paging windows to be added to each context.
        """
        return []
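
    # Illustrative sketch (editor's note, not part of the original file): an
    # override might split the sync range into fixed 30-day windows; the
    # `window_start`/`window_end` context keys are hypothetical:
    #
    #     def get_paging_windows(self, context):
    #         start = self.get_starting_time(context)
    #         windows = []
    #         while start and start < pendulum.now("UTC"):
    #             end = start + datetime.timedelta(days=30)
    #             windows.append({"window_start": start, "window_end": end})
    #             start = end
    #         return windows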

    @property
    def metadata(self) -> MetadataMapping:
        """Get stream metadata.

        Metadata attributes (`inclusion`, `selected`, etc.) are part of the Singer
        spec.

        Metadata from an input catalog will override standard metadata.

        Returns:
            A mapping from property breadcrumbs to metadata objects.
        """
        if self._metadata is not None:
            return self._metadata

        if self._tap_input_catalog:
            catalog_entry = self._tap_input_catalog.get_stream(self.tap_stream_id)
            if catalog_entry:
                self._metadata = catalog_entry.metadata
                return self._metadata

        self._metadata = MetadataMapping.get_standard_metadata(
            schema=self.schema,
            replication_method=self.forced_replication_method,
            key_properties=self.primary_keys or [],
            valid_replication_keys=(
                [self.replication_key] if self.replication_key else None
            ),
            schema_name=None,
        )

        # If there's no input catalog, select all streams
        if self._tap_input_catalog is None:
            self._metadata.root.selected = True

        return self._metadata

    @property
    def _singer_catalog_entry(self) -> CatalogEntry:
        """Return catalog entry as specified by the Singer catalog spec.

        Returns:
            A `CatalogEntry` object for this stream.
        """
        return CatalogEntry(
            tap_stream_id=self.tap_stream_id,
            stream=self.name,
            schema=SchemaPlus.from_dict(self.schema),
            metadata=self.metadata,
            key_properties=self.primary_keys or [],
            replication_key=self.replication_key,
            replication_method=self.replication_method,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
        )

    @property
    def _singer_catalog(self) -> Catalog:
        """Return a Singer catalog containing only this stream.

        Returns:
            A `Catalog` object with this stream's catalog entry.
        """
        return Catalog([(self.tap_stream_id, self._singer_catalog_entry)])

    @property
    def config(self) -> Mapping[str, Any]:
        """Get stream configuration.

        Returns:
            A frozen (read-only) config dictionary map.
        """
        return MappingProxyType(self._config)

    @property
    def tap_stream_id(self) -> str:
        """Return a unique stream ID.

        Default implementations will return `self.name` but this behavior may be
        overridden if required by the developer.

        Returns:
            Unique stream ID.
        """
        return self.name

    @property
    def replication_method(self) -> str:
        """Get replication method.

        Returns:
            Replication method to be used for this stream.
        """
        if self.forced_replication_method:
            return str(self.forced_replication_method)
        if self.replication_key:
            return REPLICATION_INCREMENTAL
        return REPLICATION_FULL_TABLE

    # State properties:

    @property
    def tap_state(self) -> dict:
        """Return a writeable state dict for the entire tap.

        Note: This dictionary is shared (and writable) across all streams.

        This method is internal to the SDK and should not need to be overridden.
        Developers may access this property but this is not recommended except in
        advanced use cases. Instead, developers should access the latest stream
        replication key values using
        :meth:`~hotglue_singer_sdk.Stream.get_starting_timestamp()` for timestamp
        keys, or
        :meth:`~hotglue_singer_sdk.Stream.get_starting_replication_key_value()` for
        non-timestamp keys.

        Returns:
            A writeable state dict for the entire tap.
        """
        return self._tap_state

    def get_context_state(self, context: Optional[dict]) -> dict:
        """Return a writable state dict for the given context.

        Gives a partitioned context state if applicable; else returns stream state.
        A blank state will be created if none exists.

        This method is internal to the SDK and should not need to be overridden.
        Developers may access this property but this is not recommended except in
        advanced use cases. Instead, developers should access the latest stream
        replication key values using
        :meth:`~hotglue_singer_sdk.Stream.get_starting_timestamp()` for timestamp
        keys, or
        :meth:`~hotglue_singer_sdk.Stream.get_starting_replication_key_value()` for
        non-timestamp keys.

        Partition level may be overridden by
        :attr:`~hotglue_singer_sdk.Stream.state_partitioning_keys` if set.

        Args:
            context: Stream partition or context dictionary.

        Returns:
            A partitioned context state if applicable; else returns stream state.
            A blank state will be created if none exists.
        """
        state_partition_context = self._get_state_partition_context(context)
        if state_partition_context:
            return get_writeable_state_dict(
                self.tap_state,
                self.name,
                state_partition_context=state_partition_context,
            )
        return self.stream_state

    @property
    def stream_state(self) -> dict:
        """Get writable state.

        This method is internal to the SDK and should not need to be overridden.
        Developers may access this property but this is not recommended except in
        advanced use cases. Instead, developers should access the latest stream
        replication key values using
        :meth:`~hotglue_singer_sdk.Stream.get_starting_timestamp()` for timestamp
        keys, or
        :meth:`~hotglue_singer_sdk.Stream.get_starting_replication_key_value()` for
        non-timestamp keys.

        A blank state entry will be created if one doesn't already exist.

        Returns:
            A writable state dict for this stream.
        """
        return get_writeable_state_dict(self.tap_state, self.name)

    # Partitions

    @property
    def partitions(self) -> Optional[List[dict]]:
        """Get stream partitions.

        Developers may override this property to provide a default partitions list.

        By default, this method returns a list of any partitions which are already
        defined in state, otherwise None.

        Returns:
            A list of partition key dicts (if applicable), otherwise `None`.
        """
        result: List[dict] = []
        for partition_state in (
            get_state_partitions_list(self.tap_state, self.name) or []
        ):
            result.append(partition_state["context"])
        return result or None
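
    # Illustrative sketch (editor's note, not part of the original file): a
    # stream partitioned by account might override `partitions` with a static
    # list; the `account_id` key and values are hypothetical:
    #
    #     @property
    #     def partitions(self) -> Optional[List[dict]]:
    #         return [{"account_id": "a1"}, {"account_id": "a2"}]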

    # Private bookmarking methods

    def _increment_stream_state(
        self, latest_record: Dict[str, Any], *, context: Optional[dict] = None
    ) -> None:
        """Update state of stream or partition with data from the provided record.

        Raises `InvalidStreamSortException` if `self.is_sorted` is `True` and
        unsorted data is detected.

        Args:
            latest_record: The latest record emitted by the stream.
            context: Stream partition or context dictionary.

        Raises:
            ValueError: If the replication method requires a replication key but
                none is defined.
        """
        state_dict = self.get_context_state(context)
        if latest_record:
            if self.replication_method in [
                REPLICATION_INCREMENTAL,
                REPLICATION_LOG_BASED,
            ]:
                if not self.replication_key:
                    raise ValueError(
                        f"Could not detect replication key for '{self.name}' stream "
                        f"(replication method={self.replication_method})"
                    )
                treat_as_sorted = self.is_sorted
                if not treat_as_sorted and self.state_partitioning_keys is not None:
                    # Streams with custom state partitioning are not resumable.
                    treat_as_sorted = False
                increment_state(
                    state_dict,
                    replication_key=self.replication_key,
                    latest_record=latest_record,
                    is_sorted=treat_as_sorted,
                    check_sorted=self.check_sorted,
                )

    # Private message authoring methods:

    def _write_state_message(self) -> None:
        """Write out a STATE message with the latest state."""
        singer.write_message(StateMessage(value=self.tap_state))

    def _generate_schema_messages(self) -> Generator[SchemaMessage, None, None]:
        """Generate schema messages from stream maps.

        Yields:
            Schema message objects.
        """
        bookmark_keys = [self.replication_key] if self.replication_key else None
        for stream_map in self.stream_maps:
            if isinstance(stream_map, RemoveRecordTransform):
                # Don't emit schema if the stream's records are all ignored.
                continue

            schema_message = SchemaMessage(
                stream_map.stream_alias,
                stream_map.transformed_schema,
                stream_map.transformed_key_properties,
                bookmark_keys,
            )
            yield schema_message

    def _write_schema_message(self) -> None:
        """Write out a SCHEMA message with the stream schema."""
        for schema_message in self._generate_schema_messages():
            singer.write_message(schema_message)

    @property
    def mask(self) -> SelectionMask:
        """Get a boolean mask for stream and property selection.

        Returns:
            A mapping of breadcrumbs to boolean values, representing stream and
            field selection.
        """
        if self._mask is None:
            self._mask = self.metadata.resolve_selection()
        return self._mask

    def _generate_record_messages(
        self,
        record: dict,
    ) -> Generator[RecordMessage, None, None]:
        """Generate RECORD messages from stream maps.

        Args:
            record: A single stream record.

        Yields:
            Record message objects.
        """
        pop_deselected_record_properties(record, self.schema, self.mask, self.logger)
        record = conform_record_data_types(
            stream_name=self.name,
            row=record,
            schema=self.schema,
            logger=self.logger,
        )
        for stream_map in self.stream_maps:
            mapped_record = stream_map.transform(record)
            # Emit record if not filtered
            if mapped_record is not None:
                record_message = RecordMessage(
                    stream=stream_map.stream_alias,
                    record=mapped_record,
                    version=None,
                    time_extracted=utc_now(),
                )

                yield record_message

    def _write_record_message(self, record: dict) -> None:
        """Write out a RECORD message.

        Args:
            record: A single stream record.
        """
        for record_message in self._generate_record_messages(record):
            singer.write_message(record_message)

    @property
    def _metric_logging_function(self) -> Optional[Callable]:
        """Return the metrics logging function.

        Returns:
            The logging function for emitting metrics.

        Raises:
            ValueError: If logging level setting is an unsupported value.
        """
        if METRICS_LOG_LEVEL_SETTING not in self.config:
            return self.logger.info

        if self.config[METRICS_LOG_LEVEL_SETTING].upper() == "INFO":
            return self.logger.info

        if self.config[METRICS_LOG_LEVEL_SETTING].upper() == "DEBUG":
            return self.logger.debug

        if self.config[METRICS_LOG_LEVEL_SETTING].upper() == "NONE":
            return None

        raise ValueError(
            "Unexpected logging level for metrics: "
            + self.config[METRICS_LOG_LEVEL_SETTING]
        )

    def _write_metric_log(self, metric: dict, extra_tags: Optional[dict]) -> None:
        """Emit a metric log, optionally with appended tag info.

        Args:
            metric: The metric dictionary to emit.
            extra_tags: Optional extra tags to merge into the metric's tags.
        """
        if not self._metric_logging_function:
            return None

        if extra_tags:
            metric["tags"].update(extra_tags)
        self._metric_logging_function(f"INFO METRIC: {str(metric)}")

    def _write_record_count_log(
        self, record_count: int, context: Optional[dict]
    ) -> None:
        """Emit a record-count metric log, optionally with appended tag info.

        Args:
            record_count: The number of records synced.
            context: Stream partition or context dictionary.
        """
        extra_tags = {} if not context else {"context": context}
        counter_metric: Dict[str, Any] = {
            "type": "counter",
            "metric": "record_count",
            "value": record_count,
            "tags": {"stream": self.name},
        }
        self._write_metric_log(counter_metric, extra_tags=extra_tags)

    def _write_request_duration_log(
        self,
        endpoint: str,
        response: requests.Response,
        context: Optional[dict],
        extra_tags: Optional[dict],
    ) -> None:
        """Emit an HTTP request duration metric log.

        Args:
            endpoint: The endpoint that was requested.
            response: The HTTP response object.
            context: Stream partition or context dictionary.
            extra_tags: Optional extra tags to merge into the metric's tags.
        """
        request_duration_metric: Dict[str, Any] = {
            "type": "timer",
            "metric": "http_request_duration",
            "value": response.elapsed.total_seconds(),
            "tags": {
                "endpoint": endpoint,
                "http_status_code": response.status_code,
                "status": "succeeded" if response.status_code < 400 else "failed",
            },
        }
        extra_tags = extra_tags or {}
        if context:
            extra_tags["context"] = context
        self._write_metric_log(metric=request_duration_metric, extra_tags=extra_tags)

    def log_sync_costs(self) -> None:
        """Log a summary of sync costs.

        The costs are calculated via `calculate_sync_cost`.
        This method can be overridden to log results in a custom
        format. It is only called once at the end of the life of
        the stream.
        """
        if len(self._sync_costs) > 0:
            msg = f"Total Sync costs for stream {self.name}: {self._sync_costs}"
            self.logger.info(msg)

    def _check_max_record_limit(self, record_count: int) -> None:
        """Raise an exception if the stream's max record limit has been reached.

        Args:
            record_count: The number of records synced so far.

        Raises:
            MaxRecordsLimitException: If `record_count` meets or exceeds
                `_MAX_RECORDS_LIMIT`.
        """
        if (
            self._MAX_RECORDS_LIMIT is not None
            and record_count >= self._MAX_RECORDS_LIMIT
        ):
            raise MaxRecordsLimitException(
                "Stream prematurely aborted due to the stream's max record "
                f"limit ({self._MAX_RECORDS_LIMIT}) being reached."
            )

    # Handle interim stream state

    def reset_state_progress_markers(self, state: Optional[dict] = None) -> None:
        """Reset progress markers. If no state is provided, all state contexts
        will be reset.

        This method is internal to the SDK and should not need to be overridden.

        Args:
            state: State object to promote progress markers with.
        """
        if state is None or state == {}:
            context: Optional[dict]
            for context in self.partitions or [{}]:
                context = context or None
                state = self.get_context_state(context)
                reset_state_progress_markers(state)
            return

        reset_state_progress_markers(state)

    def finalize_state_progress_markers(self, state: Optional[dict] = None) -> None:
        """Finalize progress markers. If no state is provided, all state contexts
        will be finalized.

        This method is internal to the SDK and should not need to be overridden.

        If no state is provided and the stream has children, child streams will
        also be finalized.

        Args:
            state: State object to promote progress markers with.
        """
        if state is None or state == {}:
            for child_stream in self.child_streams or []:
                child_stream.finalize_state_progress_markers()

            context: Optional[dict]
            for context in self.partitions or [{}]:
                context = context or None
                state = self.get_context_state(context)
                finalize_state_progress_markers(state)
            return

        finalize_state_progress_markers(state)

    def get_child_threads(self) -> int:
        """Get the number of child threads to use.

        Returns:
            The number of child threads to use.
        """
        highest_parallelization_limit = 1
        for child_stream in self.child_streams:
            if (
                (child_stream.selected or child_stream.has_selected_descendents)
                and child_stream.parallelization_limit > 0
            ):
                if child_stream.parallelization_limit > highest_parallelization_limit:
                    highest_parallelization_limit = child_stream.parallelization_limit
        return highest_parallelization_limit
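
    # Illustrative sketch (editor's note, not part of the original file): a
    # child stream can opt into threaded syncing by raising its
    # `parallelization_limit`; the parent then fans child contexts out across
    # a thread pool (see `_sync_children_with_threads` below). The class names
    # here are hypothetical:
    #
    #     class OrderLinesStream(Stream):
    #         name = "order_lines"
    #         parent_stream_type = OrdersStream  # hypothetical parent class
    #         parallelization_limit = 5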

    def _sync_children_with_threads(self, child_context: List[Dict]) -> None:
        """Sync children for each of the given child contexts in a thread pool."""
        requests_no = len(child_context)
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=requests_no
        ) as executor:
            futures = {
                executor.submit(self._sync_children, x): x for x in child_context
            }
            # Process each future as it completes
            for future in concurrent.futures.as_completed(futures):
                # Surface any exception raised by the child sync
                future.result()

    # Private sync methods:

    def _sync_records(  # noqa C901  # too complex
        self, context: Optional[dict] = None
    ) -> None:
        """Sync records, emitting RECORD and STATE messages.

        Args:
            context: Stream partition or context dictionary.

        Raises:
            InvalidStreamSortException: If unsorted data is detected while
                `is_sorted` is `True`.
        """
        record_count = 0
        current_context: Optional[dict]
        context_list: Optional[List[dict]]
        context_list = [context] if context is not None else self.partitions
        selected = self.selected

        for current_context in context_list or [{}]:
            partition_record_count = 0
            current_context = current_context or None
            state = self.get_context_state(current_context)
            state_partition_context = self._get_state_partition_context(current_context)
            self._write_starting_replication_value(current_context)
            child_context: Optional[dict] = (
                None if current_context is None else copy.copy(current_context)
            )

            # Check if any child stream has a parallelization limit greater
            # than 1; if so, child contexts are synced across threads.
            child_threads = self.get_child_threads()
            use_threads = child_threads > 1
            # List of child contexts to buffer for parallel syncing
            parallelization_context = []

            for record_result in self.get_records(current_context):
                if isinstance(record_result, tuple):
                    # Tuple items should be the record and the child context
                    record, child_context = record_result
                else:
                    record = record_result
                    child_context = copy.copy(
                        self.get_child_context(record=record, context=child_context)
                    )
                for key, val in (state_partition_context or {}).items():
                    # Add state context to records if not already present
                    if key not in record:
                        record[key] = val

                # Sync children, except when primary mapper filters out the record
                if self.stream_maps[0].get_filter_result(record):
                    if use_threads:
                        # Buffer child contexts until one batch per thread is
                        # available, then sync that batch in parallel.
                        if len(parallelization_context) < child_threads:
                            parallelization_context.append(child_context)
                        if len(parallelization_context) == child_threads:
                            self._sync_children_with_threads(parallelization_context)
                            parallelization_context = []
                    else:
                        self._sync_children(child_context)
                self._check_max_record_limit(record_count)
                if selected:
                    if (record_count - 1) % self.STATE_MSG_FREQUENCY == 0:
                        self._write_state_message()
                    self._write_record_message(record)
                    try:
                        self._increment_stream_state(record, context=current_context)
                    except InvalidStreamSortException as ex:
                        log_sort_error(
                            log_fn=self.logger.error,
                            ex=ex,
                            record_count=record_count + 1,
                            partition_record_count=partition_record_count + 1,
                            current_context=current_context,
                            state_partition_context=state_partition_context,
                            stream_name=self.name,
                        )
                        raise ex

                record_count += 1
                partition_record_count += 1

            # Sync any child contexts still buffered after the record loop ends
            if use_threads and len(parallelization_context) > 0:
                self._sync_children_with_threads(parallelization_context)
                parallelization_context = []
            if current_context == state_partition_context:
                # Finalize per-partition state only if 1:1 with context
                finalize_state_progress_markers(state)
        if not context:
            # Finalize total stream only if we have the full context.
            # Otherwise will be finalized by tap at end of sync.
            finalize_state_progress_markers(self.stream_state)
        self._write_record_count_log(record_count=record_count, context=context)
        # Reset interim bookmarks before emitting final STATE message:
        self._write_state_message()

    # Public methods ("final", not recommended to be overridden)

    @final
    def sync(self, context: Optional[dict] = None) -> None:
        """Sync this stream.

        This method is internal to the SDK and should not need to be overridden.

        Args:
            context: Stream partition or context dictionary.
        """
        msg = f"Beginning {self.replication_method.lower()} sync of '{self.name}'"
        if context:
            msg += f" with context: {context}"
        self.logger.info(f"{msg}...")

        # Use a replication signpost, if available
        signpost = self.get_replication_key_signpost(context)
        if signpost:
            self._write_replication_key_signpost(context, signpost)

        # Send a SCHEMA message to the downstream target:
        if self.selected:
            self._write_schema_message()
        # Sync the records themselves:
        self._sync_records(context)

    def _sync_children(self, child_context: dict) -> None:
        """Sync all selected child streams for the given child context."""
        for child_stream in self.child_streams:
            if child_stream.selected or child_stream.has_selected_descendents:
                child_stream.state_partitioning_keys = list(
                    set(child_stream.state_partitioning_keys or [])
                    | set(child_context.keys())
                )
                child_stream.sync(context=child_context)

    # Overridable Methods

    def apply_catalog(self, catalog: Catalog) -> None:
        """Apply a catalog dict, updating any settings overridden within the catalog.

        Developers may override this method in order to introduce advanced catalog
        parsing, or to explicitly fail on advanced catalog customizations which
        are not supported by the tap.

        Args:
            catalog: Catalog object passed to the tap. Defines schema, primary and
                replication keys, as well as selection metadata.
        """
        self._tap_input_catalog = catalog

        catalog_entry = catalog.get_stream(self.name)
        if catalog_entry:
            self.primary_keys = catalog_entry.key_properties
            self.replication_key = catalog_entry.replication_key
            if catalog_entry.replication_method:
                self.forced_replication_method = catalog_entry.replication_method

    def _get_state_partition_context(self, context: Optional[dict]) -> Optional[Dict]:
        """Override state handling if Stream.state_partitioning_keys is specified.

        Args:
            context: Stream partition or context dictionary.

        Returns:
            The context narrowed to `state_partitioning_keys`, or `None` if no
            context or no partitioning keys are set.
        """
        if context is None:
            return None

        if self.state_partitioning_keys is None:
            return None

        return {k: v for k, v in context.items() if k in self.state_partitioning_keys}

    def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
        """Return a child context object from the record and optional provided context.

        By default, will return context if provided and otherwise the record dict.

        Developers may override this behavior to send specific information to child
        streams for context.

        Args:
            record: Individual record in the stream.
            context: Stream partition or context dictionary.

        Returns:
            A dictionary with context values for a child stream.

        Raises:
            NotImplementedError: If the stream has children but this method is not
                overridden.
        """
        if context is None:
            for child_stream in self.child_streams:
                if child_stream.state_partitioning_keys is None:
                    parent_type = type(self).__name__
                    child_type = type(child_stream).__name__
                    raise NotImplementedError(
                        "No child context behavior was defined between parent stream "
                        f"'{self.name}' and child stream '{child_stream.name}'. "
                        "The parent stream must define "
                        f"`{parent_type}.get_child_context()` and/or the child stream "
                        f"must define `{child_type}.state_partitioning_keys`."
                    )

        return context or record
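
    # Illustrative sketch (editor's note, not part of the original file): a
    # parent stream typically passes only the identifier its children need;
    # the `order_id` context key is hypothetical:
    #
    #     def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
    #         return {"order_id": record["id"]}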

    # Abstract Methods

    @abc.abstractmethod
    def get_records(
        self, context: Optional[dict]
    ) -> Iterable[Union[dict, Tuple[dict, dict]]]:
        """Abstract row generator function. Must be overridden by the child class.

        Each row emitted should be a dictionary of property names to their values.
        Returns either a record dict or a tuple: (record_dict, child_context)

        A method which should retrieve data from the source and return records
        incrementally using the python `yield` operator.

        Only custom stream types need to define this method. REST and GraphQL streams
        should instead use the class-specific methods for REST or GraphQL,
        respectively.

        This method takes an optional `context` argument, which can be safely ignored
        unless the stream is a child stream or requires partitioning.
        More info: :doc:`/partitioning`.

        Parent streams can optionally return a tuple, in which case the second item
        in the tuple is a `child_context` dictionary for the child stream's
        `context`.
        More info: :doc:`/parent_streams`

        Args:
            context: Stream partition or context dictionary.
        """
        pass
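
    # Illustrative sketch (editor's note, not part of the original file): a
    # minimal implementation might read from a local JSONL file; the
    # `source_file` config key is hypothetical:
    #
    #     def get_records(self, context):
    #         with open(self.config["source_file"]) as f:
    #             for line in f:
    #                 yield json.loads(line)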

    def post_process(self, row: dict, context: Optional[dict] = None) -> Optional[dict]:
        """As needed, append or transform raw data to match expected structure.

        Optional. This method gives developers an opportunity to "clean up" the
        results prior to returning records downstream - for instance: cleaning,
        renaming, or appending properties to the raw record result returned from
        the API.

        Developers may also return `None` from this method to filter out
        invalid or not-applicable records from the stream.

        Args:
            row: Individual record in the stream.
            context: Stream partition or context dictionary.

        Returns:
            The resulting record dict, or `None` if the record should be excluded.
        """
        return row