airbyte-cdk 6.31.2.dev0__py3-none-any.whl → 6.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/cli/source_declarative_manifest/_run.py +9 -3
- airbyte_cdk/connector_builder/connector_builder_handler.py +3 -2
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +7 -7
- airbyte_cdk/sources/declarative/auth/jwt.py +17 -11
- airbyte_cdk/sources/declarative/auth/oauth.py +89 -23
- airbyte_cdk/sources/declarative/auth/token_provider.py +4 -5
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +19 -9
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +145 -43
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +51 -2
- airbyte_cdk/sources/declarative/declarative_stream.py +3 -1
- airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
- airbyte_cdk/sources/declarative/incremental/__init__.py +6 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +400 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +3 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +35 -3
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +20 -7
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +41 -5
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +143 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +313 -30
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +5 -5
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +46 -12
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +22 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +4 -4
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +6 -12
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/declarative/schema/__init__.py +2 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +44 -5
- airbyte_cdk/sources/http_logger.py +1 -1
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +51 -57
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +22 -13
- airbyte_cdk/sources/streams/core.py +6 -6
- airbyte_cdk/sources/streams/http/http.py +1 -2
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +231 -62
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +171 -88
- airbyte_cdk/sources/types.py +4 -2
- airbyte_cdk/sources/utils/transform.py +23 -2
- airbyte_cdk/test/utils/manifest_only_fixtures.py +1 -2
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/slice_hasher.py +8 -1
- airbyte_cdk-6.33.0.dist-info/LICENSE_SHORT +1 -0
- {airbyte_cdk-6.31.2.dev0.dist-info → airbyte_cdk-6.33.0.dist-info}/METADATA +6 -6
- {airbyte_cdk-6.31.2.dev0.dist-info → airbyte_cdk-6.33.0.dist-info}/RECORD +47 -41
- {airbyte_cdk-6.31.2.dev0.dist-info → airbyte_cdk-6.33.0.dist-info}/WHEEL +1 -1
- {airbyte_cdk-6.31.2.dev0.dist-info → airbyte_cdk-6.33.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.31.2.dev0.dist-info → airbyte_cdk-6.33.0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py (new file)
@@ -0,0 +1,143 @@
+"""Contains functions to compile custom code from text."""
+
+import hashlib
+import os
+import sys
+from collections.abc import Mapping
+from types import ModuleType
+from typing import Any, cast
+
+from typing_extensions import Literal
+
+ChecksumType = Literal["md5", "sha256"]
+CHECKSUM_FUNCTIONS = {
+    "md5": hashlib.md5,
+    "sha256": hashlib.sha256,
+}
+COMPONENTS_MODULE_NAME = "components"
+SDM_COMPONENTS_MODULE_NAME = "source_declarative_manifest.components"
+INJECTED_MANIFEST = "__injected_declarative_manifest"
+INJECTED_COMPONENTS_PY = "__injected_components_py"
+INJECTED_COMPONENTS_PY_CHECKSUMS = "__injected_components_py_checksums"
+ENV_VAR_ALLOW_CUSTOM_CODE = "AIRBYTE_ALLOW_CUSTOM_CODE"
+
+
+class AirbyteCodeTamperedError(Exception):
+    """Raised when the connector's components module does not match its checksum.
+
+    This is a fatal error, as it can be a sign of code tampering.
+    """
+
+
+class AirbyteCustomCodeNotPermittedError(Exception):
+    """Raised when custom code is attempted to be run in an environment that does not support it."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            "Custom connector code is not permitted in this environment. "
+            "If you need to run custom code, please ask your administrator to set the `AIRBYTE_ALLOW_CUSTOM_CODE` "
+            "environment variable to 'true' in your Airbyte environment. "
+            "If you see this message in Airbyte Cloud, your workspace does not allow executing "
+            "custom connector code."
+        )
+
+
+def _hash_text(input_text: str, hash_type: str = "md5") -> str:
+    """Return the hash of the input text using the specified hash type."""
+    if not input_text:
+        raise ValueError("Input text cannot be empty.")
+
+    hash_object = CHECKSUM_FUNCTIONS[hash_type]()
+    hash_object.update(input_text.encode())
+    return hash_object.hexdigest()
+
+
+def custom_code_execution_permitted() -> bool:
+    """Return `True` if custom code execution is permitted, otherwise `False`.
+
+    Custom code execution is permitted if the `AIRBYTE_ALLOW_CUSTOM_CODE` environment variable is set to 'true'.
+    """
+    return os.environ.get(ENV_VAR_ALLOW_CUSTOM_CODE, "").lower() == "true"
+
+
+def validate_python_code(
+    code_text: str,
+    checksums: dict[str, str] | None,
+) -> None:
+    """Validate the provided Python code text against the provided checksums.
+
+    Currently we fail if no checksums are provided, although this may change in the future.
+    """
+    if not checksums:
+        raise ValueError(f"A checksum is required to validate the code. Received: {checksums}")
+
+    for checksum_type, checksum in checksums.items():
+        if checksum_type not in CHECKSUM_FUNCTIONS:
+            raise ValueError(
+                f"Unsupported checksum type: {checksum_type}. Supported checksum types are: {CHECKSUM_FUNCTIONS.keys()}"
+            )
+
+        if _hash_text(code_text, checksum_type) != checksum:
+            raise AirbyteCodeTamperedError(f"{checksum_type} checksum does not match.")
+
+
+def get_registered_components_module(
+    config: Mapping[str, Any] | None,
+) -> ModuleType | None:
+    """Get a components module object based on the provided config.
+
+    If custom python components is provided, this will be loaded. Otherwise, we will
+    attempt to load from the `components` module already imported/registered in sys.modules.
+
+    If custom `components.py` text is provided in config, it will be registered with sys.modules
+    so that it can be later imported by manifest declarations which reference the provided classes.
+
+    Returns `None` if no components is provided and the `components` module is not found.
+    """
+    if config and INJECTED_COMPONENTS_PY in config:
+        if not custom_code_execution_permitted():
+            raise AirbyteCustomCodeNotPermittedError
+
+        # Create a new module object and execute the provided Python code text within it
+        python_text: str = config[INJECTED_COMPONENTS_PY]
+        return register_components_module_from_string(
+            components_py_text=python_text,
+            checksums=config.get(INJECTED_COMPONENTS_PY_CHECKSUMS, None),
+        )
+
+    # Check for `components` or `source_declarative_manifest.components`.
+    if SDM_COMPONENTS_MODULE_NAME in sys.modules:
+        return cast(ModuleType, sys.modules.get(SDM_COMPONENTS_MODULE_NAME))
+
+    if COMPONENTS_MODULE_NAME in sys.modules:
+        return cast(ModuleType, sys.modules.get(COMPONENTS_MODULE_NAME))
+
+    # Could not find module 'components' in `sys.modules`
+    # and INJECTED_COMPONENTS_PY was not provided in config.
+    return None
+
+
+def register_components_module_from_string(
+    components_py_text: str,
+    checksums: dict[str, Any] | None,
+) -> ModuleType:
+    """Load and return the components module from a provided string containing the python code."""
+    # First validate the code
+    validate_python_code(
+        code_text=components_py_text,
+        checksums=checksums,
+    )
+
+    # Create a new module object
+    components_module = ModuleType(name=COMPONENTS_MODULE_NAME)
+
+    # Execute the module text in the module's namespace
+    exec(components_py_text, components_module.__dict__)
+
+    # Register the module in `sys.modules`` so it can be imported as
+    # `source_declarative_manifest.components` and/or `components`.
+    sys.modules[SDM_COMPONENTS_MODULE_NAME] = components_module
+    sys.modules[COMPONENTS_MODULE_NAME] = components_module
+
+    # Now you can import and use the module
+    return components_module
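The new `custom_code_compiler.py` module gates config-injected `components.py` code behind the `AIRBYTE_ALLOW_CUSTOM_CODE` environment variable and a checksum check. Below is a minimal sketch of how these helpers fit together; the config keys and function names come from the diff above, while the injected class and code text are invented for illustration.

```python
# Illustrative only: exercises the helpers added in custom_code_compiler.py.
# `MyCustomExtractor` and the code text are made up; requires
# AIRBYTE_ALLOW_CUSTOM_CODE=true, otherwise AirbyteCustomCodeNotPermittedError is raised.
import hashlib

from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
    INJECTED_COMPONENTS_PY,
    INJECTED_COMPONENTS_PY_CHECKSUMS,
    get_registered_components_module,
)

components_py = "class MyCustomExtractor:\n    pass\n"

config = {
    INJECTED_COMPONENTS_PY: components_py,
    # The checksum must match the injected text, or AirbyteCodeTamperedError is raised.
    INJECTED_COMPONENTS_PY_CHECKSUMS: {"md5": hashlib.md5(components_py.encode()).hexdigest()},
}

module = get_registered_components_module(config=config)
assert module is not None
print(module.MyCustomExtractor)
```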
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -87,6 +87,8 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
+    ConcurrentCursorFactory,
+    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -101,6 +103,7 @@ from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_mi
     LegacyToPerPartitionStateMigration,
 )
 from airbyte_cdk.sources.declarative.models import (
+    Clamping,
     CustomStateMigration,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -130,6 +133,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     CheckStream as CheckStreamModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    ComplexFieldType as ComplexFieldTypeModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ComponentMappingDefinition as ComponentMappingDefinitionModel,
 )
@@ -363,6 +369,10 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ZipfileDecoder as ZipfileDecoderModel,
 )
+from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
+    COMPONENTS_MODULE_NAME,
+    SDM_COMPONENTS_MODULE_NAME,
+)
 from airbyte_cdk.sources.declarative.partition_routers import (
     CartesianProductStreamSlicer,
     ListPartitionRouter,
@@ -422,6 +432,7 @@ from airbyte_cdk.sources.declarative.retrievers import (
     SimpleRetrieverTestReadDecorator,
 )
 from airbyte_cdk.sources.declarative.schema import (
+    ComplexFieldType,
     DefaultSchemaLoader,
     DynamicSchemaLoader,
     InlineSchemaLoader,
@@ -456,6 +467,16 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
+    NoopMessageRepository,
+)
+from airbyte_cdk.sources.streams.concurrent.clamping import (
+    ClampingEndProvider,
+    ClampingStrategy,
+    DayClampingStrategy,
+    MonthClampingStrategy,
+    NoClamping,
+    WeekClampingStrategy,
+    Weekday,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -486,6 +507,7 @@ class ModelToComponentFactory:
         disable_cache: bool = False,
         disable_resumable_full_refresh: bool = False,
         message_repository: Optional[MessageRepository] = None,
+        connector_state_manager: Optional[ConnectorStateManager] = None,
     ):
         self._init_mappings()
         self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
@@ -497,6 +519,7 @@ class ModelToComponentFactory:
         self._message_repository = message_repository or InMemoryMessageRepository(
             self._evaluate_log_level(emit_connector_builder_messages)
         )
+        self._connector_state_manager = connector_state_manager or ConnectorStateManager()

     def _init_mappings(self) -> None:
         self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
@@ -555,6 +578,7 @@ class ModelToComponentFactory:
             DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
             SchemaTypeIdentifierModel: self.create_schema_type_identifier,
             TypesMapModel: self.create_types_map,
+            ComplexFieldTypeModel: self.create_complex_field_type,
             JwtAuthenticatorModel: self.create_jwt_authenticator,
             LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
             ListPartitionRouterModel: self.create_list_partition_router,
@@ -878,7 +902,15 @@ class ModelToComponentFactory:
     def create_check_dynamic_stream(
         model: CheckDynamicStreamModel, config: Config, **kwargs: Any
     ) -> CheckDynamicStream:
-
+        assert model.use_check_availability is not None  # for mypy
+
+        use_check_availability = model.use_check_availability
+
+        return CheckDynamicStream(
+            stream_count=model.stream_count,
+            use_check_availability=use_check_availability,
+            parameters={},
+        )

     def create_composite_error_handler(
         self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
@@ -904,15 +936,24 @@ class ModelToComponentFactory:

     def create_concurrent_cursor_from_datetime_based_cursor(
         self,
-        state_manager: ConnectorStateManager,
         model_type: Type[BaseModel],
         component_definition: ComponentDefinition,
         stream_name: str,
         stream_namespace: Optional[str],
         config: Config,
-
+        message_repository: Optional[MessageRepository] = None,
+        runtime_lookback_window: Optional[datetime.timedelta] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
+        # Per-partition incremental streams can dynamically create child cursors which will pass their current
+        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
+        # incoming state and connector_state_manager that is initialized when the component factory is created
+        stream_state = (
+            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
+            if "stream_state" not in kwargs
+            else kwargs["stream_state"]
+        )
+
         component_type = component_definition.get("type")
         if component_definition.get("type") != model_type.__name__:
             raise ValueError(
@@ -972,10 +1013,22 @@ class ModelToComponentFactory:
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
             input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
-            is_sequential_state=True,
+            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
             cursor_granularity=cursor_granularity,
         )

+        # Adjusts the stream state by applying the runtime lookback window.
+        # This is used to ensure correct state handling in case of failed partitions.
+        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
+        if runtime_lookback_window and stream_state_value:
+            new_stream_state = (
+                connector_state_converter.parse_timestamp(stream_state_value)
+                - runtime_lookback_window
+            )
+            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
+                new_stream_state
+            )
+
         start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
         if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
             start_date_runtime_value = self.create_min_max_datetime(
@@ -1042,12 +1095,59 @@ class ModelToComponentFactory:
         if evaluated_step:
             step_length = parse_duration(evaluated_step)

+        clamping_strategy: ClampingStrategy = NoClamping()
+        if datetime_based_cursor_model.clamping:
+            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
+            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
+            # object which we want to keep agnostic of being low-code
+            target = InterpolatedString(
+                string=datetime_based_cursor_model.clamping.target,
+                parameters=datetime_based_cursor_model.parameters or {},
+            )
+            evaluated_target = target.eval(config=config)
+            match evaluated_target:
+                case "DAY":
+                    clamping_strategy = DayClampingStrategy()
+                    end_date_provider = ClampingEndProvider(
+                        DayClampingStrategy(is_ceiling=False),
+                        end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
+                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
+                    )
+                case "WEEK":
+                    if (
+                        not datetime_based_cursor_model.clamping.target_details
+                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
+                    ):
+                        raise ValueError(
+                            "Given WEEK clamping, weekday needs to be provided as target_details"
+                        )
+                    weekday = self._assemble_weekday(
+                        datetime_based_cursor_model.clamping.target_details["weekday"]
+                    )
+                    clamping_strategy = WeekClampingStrategy(weekday)
+                    end_date_provider = ClampingEndProvider(
+                        WeekClampingStrategy(weekday, is_ceiling=False),
+                        end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
+                        granularity=cursor_granularity or datetime.timedelta(days=1),
+                    )
+                case "MONTH":
+                    clamping_strategy = MonthClampingStrategy()
+                    end_date_provider = ClampingEndProvider(
+                        MonthClampingStrategy(is_ceiling=False),
+                        end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
+                        granularity=cursor_granularity or datetime.timedelta(days=1),
+                    )
+                case _:
+                    raise ValueError(
+                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
+                    )
+
         return ConcurrentCursor(
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=self._message_repository,
-            connector_state_manager=
+            message_repository=message_repository or self._message_repository,
+            connector_state_manager=self._connector_state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
             slice_boundary_fields=slice_boundary_fields,
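The clamping block above interpolates `clamping.target` from the `DatetimeBasedCursor` model and maps it onto the strategies added in `airbyte_cdk/sources/streams/concurrent/clamping.py`. The sketch below isolates that mapping; `resolve_clamping_strategy` is a hypothetical helper written for illustration, not a CDK function.

```python
# Hypothetical helper mirroring the DAY/WEEK/MONTH mapping above; not part of the CDK.
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)


def resolve_clamping_strategy(target: str | None, weekday: Weekday = Weekday.MONDAY) -> ClampingStrategy:
    if target is None:
        # No `clamping` section declared on the cursor.
        return NoClamping()
    if target == "DAY":
        return DayClampingStrategy()
    if target == "WEEK":
        # WEEK clamping also requires a weekday, passed via clamping.target_details in the manifest.
        return WeekClampingStrategy(weekday)
    if target == "MONTH":
        return MonthClampingStrategy()
    raise ValueError(f"Invalid clamping target {target}, expected DAY, WEEK, MONTH")


strategy = resolve_clamping_strategy("WEEK", Weekday.MONDAY)
```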
@@ -1056,6 +1156,100 @@ class ModelToComponentFactory:
             lookback_window=lookback_window,
             slice_range=step_length,
             cursor_granularity=cursor_granularity,
+            clamping_strategy=clamping_strategy,
+        )
+
+    def _assemble_weekday(self, weekday: str) -> Weekday:
+        match weekday:
+            case "MONDAY":
+                return Weekday.MONDAY
+            case "TUESDAY":
+                return Weekday.TUESDAY
+            case "WEDNESDAY":
+                return Weekday.WEDNESDAY
+            case "THURSDAY":
+                return Weekday.THURSDAY
+            case "FRIDAY":
+                return Weekday.FRIDAY
+            case "SATURDAY":
+                return Weekday.SATURDAY
+            case "SUNDAY":
+                return Weekday.SUNDAY
+            case _:
+                raise ValueError(f"Unknown weekday {weekday}")
+
+    def create_concurrent_cursor_from_perpartition_cursor(
+        self,
+        state_manager: ConnectorStateManager,
+        model_type: Type[BaseModel],
+        component_definition: ComponentDefinition,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        config: Config,
+        stream_state: MutableMapping[str, Any],
+        partition_router: PartitionRouter,
+        **kwargs: Any,
+    ) -> ConcurrentPerPartitionCursor:
+        component_type = component_definition.get("type")
+        if component_definition.get("type") != model_type.__name__:
+            raise ValueError(
+                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
+            )
+
+        datetime_based_cursor_model = model_type.parse_obj(component_definition)
+
+        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
+            raise ValueError(
+                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
+            )
+
+        interpolated_cursor_field = InterpolatedString.create(
+            datetime_based_cursor_model.cursor_field,
+            parameters=datetime_based_cursor_model.parameters or {},
+        )
+        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
+
+        datetime_format = datetime_based_cursor_model.datetime_format
+
+        cursor_granularity = (
+            parse_duration(datetime_based_cursor_model.cursor_granularity)
+            if datetime_based_cursor_model.cursor_granularity
+            else None
+        )
+
+        connector_state_converter: DateTimeStreamStateConverter
+        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
+            datetime_format=datetime_format,
+            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
+            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
+            cursor_granularity=cursor_granularity,
+        )
+
+        # Create the cursor factory
+        cursor_factory = ConcurrentCursorFactory(
+            partial(
+                self.create_concurrent_cursor_from_datetime_based_cursor,
+                state_manager=state_manager,
+                model_type=model_type,
+                component_definition=component_definition,
+                stream_name=stream_name,
+                stream_namespace=stream_namespace,
+                config=config,
+                message_repository=NoopMessageRepository(),
+            )
+        )
+
+        # Return the concurrent cursor and state converter
+        return ConcurrentPerPartitionCursor(
+            cursor_factory=cursor_factory,
+            partition_router=partition_router,
+            stream_name=stream_name,
+            stream_namespace=stream_namespace,
+            stream_state=stream_state,
+            message_repository=self._message_repository,  # type: ignore
+            connector_state_manager=state_manager,
+            connector_state_converter=connector_state_converter,
+            cursor_field=cursor_field,
         )

     @staticmethod
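`create_concurrent_cursor_from_perpartition_cursor` above builds its `ConcurrentCursorFactory` with `functools.partial`: every argument shared across partitions is bound once, and each partition later supplies only its own `stream_state`. The simplified sketch below shows just that pattern; `ChildCursor` and `make_cursor_factory` are stand-ins invented for illustration, not CDK classes.

```python
# Stand-in illustration of the partial-based factory pattern; not CDK code.
from dataclasses import dataclass
from functools import partial
from typing import Any, Callable, Mapping


@dataclass
class ChildCursor:
    stream_name: str
    stream_state: Mapping[str, Any]


def make_cursor_factory(stream_name: str) -> Callable[..., ChildCursor]:
    # Bind the arguments shared by every partition up front; stream_state is
    # supplied later, once per partition.
    return partial(ChildCursor, stream_name=stream_name)


factory = make_cursor_factory("events")
cursor_for_partition = factory(stream_state={"updated_at": "2024-01-01"})
print(cursor_for_partition)
```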
@@ -1101,7 +1295,6 @@ class ModelToComponentFactory:
         :param config: The custom defined connector config
         :return: The declarative component built from the Pydantic model to be used at runtime
         """
-
         custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
         component_fields = get_type_hints(custom_component_class)
         model_args = model.dict()
@@ -1155,14 +1348,38 @@ class ModelToComponentFactory:
         return custom_component_class(**kwargs)

     @staticmethod
-    def _get_class_from_fully_qualified_class_name(
+    def _get_class_from_fully_qualified_class_name(
+        full_qualified_class_name: str,
+    ) -> Any:
+        """Get a class from its fully qualified name.
+
+        If a custom components module is needed, we assume it is already registered - probably
+        as `source_declarative_manifest.components` or `components`.
+
+        Args:
+            full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName").
+
+        Returns:
+            Any: The class object.
+
+        Raises:
+            ValueError: If the class cannot be loaded.
+        """
         split = full_qualified_class_name.split(".")
-
+        module_name_full = ".".join(split[:-1])
         class_name = split[-1]
+
+        try:
+            module_ref = importlib.import_module(module_name_full)
+        except ModuleNotFoundError as e:
+            raise ValueError(f"Could not load module `{module_name_full}`.") from e
+
         try:
-            return getattr(
-        except AttributeError:
-            raise ValueError(
+            return getattr(module_ref, class_name)
+        except AttributeError as e:
+            raise ValueError(
+                f"Could not load class `{class_name}` from module `{module_name_full}`.",
+            ) from e

     @staticmethod
     def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]:
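The rewritten `_get_class_from_fully_qualified_class_name` splits a `"module.ClassName"` string, imports the module via `importlib`, and wraps both failure modes in `ValueError`. A standalone sketch of the same idea, using only the standard library, is shown below; `load_class` is a hypothetical helper, not the CDK method.

```python
# Standalone sketch of fully-qualified-name lookup; `load_class` is hypothetical.
import importlib
from typing import Any


def load_class(full_qualified_class_name: str) -> Any:
    module_name, _, class_name = full_qualified_class_name.rpartition(".")
    try:
        module_ref = importlib.import_module(module_name)
    except ModuleNotFoundError as e:
        raise ValueError(f"Could not load module `{module_name}`.") from e
    try:
        return getattr(module_ref, class_name)
    except AttributeError as e:
        raise ValueError(f"Could not load class `{class_name}` from module `{module_name}`.") from e


json_decoder_cls = load_class("json.JSONDecoder")
```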
@@ -1336,18 +1553,15 @@ class ModelToComponentFactory:
             raise ValueError(
                 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
             )
-
-
-
-
-
-
-
-
-
-                else None
-            ),
-        }
+        cursor = (
+            combined_slicers
+            if isinstance(
+                combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+            )
+            else self._create_component_from_model(model=model.incremental_sync, config=config)
+        )
+
+        client_side_incremental_sync = {"cursor": cursor}

         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -1433,7 +1647,7 @@ class ModelToComponentFactory:
     ) -> Optional[PartitionRouter]:
         if (
             hasattr(model, "partition_router")
-            and isinstance(model, SimpleRetrieverModel)
+            and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel)
             and model.partition_router
         ):
             stream_slicer_model = model.partition_router
@@ -1467,6 +1681,31 @@ class ModelToComponentFactory:
         stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)

         if model.incremental_sync and stream_slicer:
+            if model.retriever.type == "AsyncRetriever":
+                if model.incremental_sync.type != "DatetimeBasedCursor":
+                    # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the support or unordered slices (for example, when we trigger reports for January and February, the report in February can be completed first). Once we have support for custom concurrent cursor or have a new implementation available in the CDK, we can enable more cursors here.
+                    raise ValueError(
+                        "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet"
+                    )
+                if stream_slicer:
+                    return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
+                        state_manager=self._connector_state_manager,
+                        model_type=DatetimeBasedCursorModel,
+                        component_definition=model.incremental_sync.__dict__,
+                        stream_name=model.name or "",
+                        stream_namespace=None,
+                        config=config or {},
+                        stream_state={},
+                        partition_router=stream_slicer,
+                    )
+                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
+                    model_type=DatetimeBasedCursorModel,
+                    component_definition=model.incremental_sync.__dict__,
+                    stream_name=model.name or "",
+                    stream_namespace=None,
+                    config=config or {},
+                )
+
             incremental_sync_model = model.incremental_sync
             if (
                 hasattr(incremental_sync_model, "global_substream_cursor")
@@ -1492,6 +1731,22 @@ class ModelToComponentFactory:
                 stream_cursor=cursor_component,
             )
         elif model.incremental_sync:
+            if model.retriever.type == "AsyncRetriever":
+                if model.incremental_sync.type != "DatetimeBasedCursor":
+                    # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the support or unordered slices (for example, when we trigger reports for January and February, the report in February can be completed first). Once we have support for custom concurrent cursor or have a new implementation available in the CDK, we can enable more cursors here.
+                    raise ValueError(
+                        "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet"
+                    )
+                if model.retriever.partition_router:
+                    # Note that this development is also done in parallel to the per partition development which once merged we could support here by calling `create_concurrent_cursor_from_perpartition_cursor`
+                    raise ValueError("Per partition state is not supported yet for AsyncRetriever")
+                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
+                    model_type=DatetimeBasedCursorModel,
+                    component_definition=model.incremental_sync.__dict__,
+                    stream_name=model.name or "",
+                    stream_namespace=None,
+                    config=config or {},
+                )
         return (
             self._create_component_from_model(model=model.incremental_sync, config=config)
             if model.incremental_sync
@@ -1710,10 +1965,26 @@ class ModelToComponentFactory:
     ) -> InlineSchemaLoader:
         return InlineSchemaLoader(schema=model.schema_ or {}, parameters={})

-
-
+    def create_complex_field_type(
+        self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
+    ) -> ComplexFieldType:
+        items = (
+            self._create_component_from_model(model=model.items, config=config)
+            if isinstance(model.items, ComplexFieldTypeModel)
+            else model.items
+        )
+
+        return ComplexFieldType(field_type=model.field_type, items=items)
+
+    def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
+        target_type = (
+            self._create_component_from_model(model=model.target_type, config=config)
+            if isinstance(model.target_type, ComplexFieldTypeModel)
+            else model.target_type
+        )
+
         return TypesMap(
-            target_type=
+            target_type=target_type,
             current_type=model.current_type,
             condition=model.condition if model.condition is not None else "True",
         )
@@ -1911,6 +2182,12 @@ class ModelToComponentFactory:
     def create_oauth_authenticator(
         self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeOauth2Authenticator:
+        profile_assertion = (
+            self._create_component_from_model(model.profile_assertion, config=config)
+            if model.profile_assertion
+            else None
+        )
+
         if model.refresh_token_updater:
             # ignore type error because fixing it would have a lot of dependencies, revisit later
             return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
@@ -1931,13 +2208,17 @@ class ModelToComponentFactory:
                 ).eval(config),
                 client_id=InterpolatedString.create(
                     model.client_id, parameters=model.parameters or {}
-                ).eval(config)
+                ).eval(config)
+                if model.client_id
+                else model.client_id,
                 client_secret_name=InterpolatedString.create(
                     model.client_secret_name or "client_secret", parameters=model.parameters or {}
                 ).eval(config),
                 client_secret=InterpolatedString.create(
                     model.client_secret, parameters=model.parameters or {}
-                ).eval(config)
+                ).eval(config)
+                if model.client_secret
+                else model.client_secret,
                 access_token_config_path=model.refresh_token_updater.access_token_config_path,
                 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
@@ -1983,6 +2264,8 @@ class ModelToComponentFactory:
             config=config,
             parameters=model.parameters or {},
             message_repository=self._message_repository,
+            profile_assertion=profile_assertion,
+            use_profile_assertion=model.use_profile_assertion,
         )

     def create_offset_increment(