airbyte-cdk 6.20.0__py3-none-any.whl → 6.20.2.dev0__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- airbyte_cdk/sources/declarative/auth/oauth.py +0 -34
- airbyte_cdk/sources/declarative/checks/__init__.py +2 -18
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +80 -16
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +21 -93
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +0 -43
- airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
- airbyte_cdk/sources/declarative/incremental/__init__.py +6 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +331 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +3 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +15 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +1 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +26 -96
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +105 -111
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +4 -33
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +0 -11
- airbyte_cdk/sources/file_based/exceptions.py +0 -34
- airbyte_cdk/sources/file_based/file_based_source.py +5 -28
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +4 -18
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +2 -25
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +2 -30
- airbyte_cdk/sources/streams/concurrent/cursor.py +30 -21
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +4 -33
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +4 -42
- airbyte_cdk/sources/types.py +0 -3
- {airbyte_cdk-6.20.0.dist-info → airbyte_cdk-6.20.2.dev0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.20.0.dist-info → airbyte_cdk-6.20.2.dev0.dist-info}/RECORD +30 -31
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +0 -51
- airbyte_cdk/sources/declarative/requesters/README.md +0 -56
- {airbyte_cdk-6.20.0.dist-info → airbyte_cdk-6.20.2.dev0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.20.0.dist-info → airbyte_cdk-6.20.2.dev0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.20.0.dist-info → airbyte_cdk-6.20.2.dev0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -54,7 +54,7 @@ from airbyte_cdk.sources.declarative.auth.token_provider import (
     SessionTokenProvider,
     TokenProvider,
 )
-from airbyte_cdk.sources.declarative.checks import CheckDynamicStream, CheckStream
+from airbyte_cdk.sources.declarative.checks import CheckStream
 from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
 from airbyte_cdk.sources.declarative.datetime import MinMaxDatetime
 from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
@@ -72,8 +72,6 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
-    JsonParser,
-    Parser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -86,6 +84,8 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
+    ConcurrentCursorFactory,
+    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -123,9 +123,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     BearerAuthenticator as BearerAuthenticatorModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    CheckDynamicStream as CheckDynamicStreamModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     CheckStream as CheckStreamModel,
 )
@@ -252,9 +249,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    JsonParser as JsonParserModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -446,6 +440,7 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
+    NoopMessageRepository,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -496,7 +491,6 @@ class ModelToComponentFactory:
             BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
             BearerAuthenticatorModel: self.create_bearer_authenticator,
             CheckStreamModel: self.create_check_stream,
-            CheckDynamicStreamModel: self.create_check_dynamic_stream,
             CompositeErrorHandlerModel: self.create_composite_error_handler,
             CompositeRawDecoderModel: self.create_composite_raw_decoder,
             ConcurrencyLevelModel: self.create_concurrency_level,
@@ -531,7 +525,6 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
-            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -850,12 +843,6 @@ class ModelToComponentFactory:
     def create_check_stream(model: CheckStreamModel, config: Config, **kwargs: Any) -> CheckStream:
         return CheckStream(stream_names=model.stream_names, parameters={})

-    @staticmethod
-    def create_check_dynamic_stream(
-        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
-    ) -> CheckDynamicStream:
-        return CheckDynamicStream(stream_count=model.stream_count, parameters={})
-
     def create_composite_error_handler(
         self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
     ) -> CompositeErrorHandler:
@@ -887,6 +874,8 @@ class ModelToComponentFactory:
         stream_namespace: Optional[str],
         config: Config,
         stream_state: MutableMapping[str, Any],
+        message_repository: Optional[MessageRepository] = None,
+        runtime_lookback_window: Optional[int] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
         component_type = component_definition.get("type")
@@ -944,6 +933,11 @@ class ModelToComponentFactory:
         if evaluated_lookback_window:
             lookback_window = parse_duration(evaluated_lookback_window)

+        if runtime_lookback_window and lookback_window:
+            lookback_window = max(lookback_window, runtime_lookback_window)
+        elif runtime_lookback_window:
+            lookback_window = runtime_lookback_window
+
         connector_state_converter: DateTimeStreamStateConverter
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
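The merge rule added above combines the manifest-configured lookback window with a runtime lookback window passed in by the caller (for example, one derived from previously saved state), keeping the wider of the two. A minimal sketch of that rule, using illustrative names rather than the CDK's internals:

from datetime import timedelta
from typing import Optional

def resolve_lookback_window(
    configured: Optional[timedelta], runtime: Optional[timedelta]
) -> Optional[timedelta]:
    # When both windows are set, the wider one wins; otherwise use whichever exists.
    if configured and runtime:
        return max(configured, runtime)
    return runtime or configured

assert resolve_lookback_window(timedelta(days=1), timedelta(days=7)) == timedelta(days=7)
assert resolve_lookback_window(None, timedelta(days=3)) == timedelta(days=3)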
@@ -1022,7 +1016,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=self._message_repository,
+            message_repository=message_repository or self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -1034,6 +1028,63 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )

+    def create_concurrent_cursor_from_perpartition_cursor(
+        self,
+        state_manager: ConnectorStateManager,
+        model_type: Type[BaseModel],
+        component_definition: ComponentDefinition,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        config: Config,
+        stream_state: MutableMapping[str, Any],
+        partition_router: PartitionRouter,
+        **kwargs: Any,
+    ) -> ConcurrentPerPartitionCursor:
+        component_type = component_definition.get("type")
+        if component_definition.get("type") != model_type.__name__:
+            raise ValueError(
+                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
+            )
+
+        datetime_based_cursor_model = model_type.parse_obj(component_definition)
+
+        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
+            raise ValueError(
+                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
+            )
+
+        interpolated_cursor_field = InterpolatedString.create(
+            datetime_based_cursor_model.cursor_field,
+            parameters=datetime_based_cursor_model.parameters or {},
+        )
+        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
+
+        # Create the cursor factory
+        cursor_factory = ConcurrentCursorFactory(
+            partial(
+                self.create_concurrent_cursor_from_datetime_based_cursor,
+                state_manager=state_manager,
+                model_type=model_type,
+                component_definition=component_definition,
+                stream_name=stream_name,
+                stream_namespace=stream_namespace,
+                config=config,
+                message_repository=NoopMessageRepository(),
+            )
+        )
+
+        # Return the concurrent cursor and state converter
+        return ConcurrentPerPartitionCursor(
+            cursor_factory=cursor_factory,
+            partition_router=partition_router,
+            stream_name=stream_name,
+            stream_namespace=stream_namespace,
+            stream_state=stream_state,
+            message_repository=self._message_repository,  # type: ignore
+            connector_state_manager=state_manager,
+            cursor_field=cursor_field,
+        )
+
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
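The new factory method above pre-binds every stream-level argument of create_concurrent_cursor_from_datetime_based_cursor with functools.partial, so the ConcurrentCursorFactory can later mint one cursor per partition from that partition's state alone; child cursors get a NoopMessageRepository so that only the parent ConcurrentPerPartitionCursor emits state messages. A rough sketch of the pre-binding pattern (all names below are illustrative stand-ins, not CDK APIs):

from functools import partial

def build_cursor(stream_name: str, config: dict, partition_state: dict) -> dict:
    # Stand-in for a cursor constructor that takes many stream-level arguments.
    return {"stream": stream_name, "config": config, "state": partition_state}

# Bind the stream-level arguments once...
cursor_factory = partial(build_cursor, "orders", {"start_date": "2024-01-01"})

# ...then create one cursor per partition from its saved state alone.
cursor_a = cursor_factory({"updated_at": "2024-03-01"})
cursor_b = cursor_factory({"updated_at": "2024-04-15"})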
@@ -1048,17 +1099,17 @@ class ModelToComponentFactory:
         self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> CursorPaginationStrategy:
         if isinstance(decoder, PaginationDecoderDecorator):
-            inner_decoder = decoder.decoder
-        else:
-            inner_decoder = decoder
-            decoder = PaginationDecoderDecorator(decoder=decoder)
-
-        if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-            raise ValueError(
-                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
-            )
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)

         return CursorPaginationStrategy(
             cursor_value=model.cursor_value,
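This hunk replaces the _is_supported_decoder_for_pagination helper (deleted at the bottom of this file) with inline isinstance gates: pagination strategies can only traverse JSON- or XML-shaped payloads, so any other decoder is rejected before the paginator is built. A self-contained sketch of the gate, with stand-in decoder classes:

class JsonDecoder: ...
class XmlDecoder: ...
class CsvDecoder: ...

def validate_pagination_decoder(decoder: object) -> None:
    # Mirrors the check above: anything that is not JSON- or XML-like is refused.
    if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
        raise ValueError(
            f"Provided decoder of {type(decoder)=} is not supported. "
            "Please set JsonDecoder or XmlDecoder instead."
        )

validate_pagination_decoder(JsonDecoder())  # accepted
try:
    validate_pagination_decoder(CsvDecoder())  # rejected up front
except ValueError:
    pass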
@@ -1316,18 +1367,15 @@ class ModelToComponentFactory:
                 raise ValueError(
                     "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
                 )
-            client_side_incremental_sync = {
-                "date_time_based_cursor": self._create_component_from_model(
-                    model=model.incremental_sync, config=config
-                ),
-                "substream_cursor": (
-                    combined_slicers
-                    if isinstance(
-                        combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
-                    )
-                    else None
-                ),
-            }
+            cursor = (
+                combined_slicers
+                if isinstance(
+                    combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+                )
+                else self._create_component_from_model(model=model.incremental_sync, config=config)
+            )
+
+            client_side_incremental_sync = {"cursor": cursor}

         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
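The rewrite above collapses the old two-slot mapping into a single "cursor" entry: the already-built combined slicer is reused when it is a per-partition or global substream cursor, and a fresh cursor is built from the incremental_sync model otherwise. The shape of that selection, sketched with stand-in types:

class GlobalSubstreamCursor: ...
class PerPartitionWithGlobalCursor: ...
class DatetimeCursor: ...  # stand-in for a cursor built from the model

def pick_client_side_cursor(combined_slicer: object) -> object:
    return (
        combined_slicer
        if isinstance(combined_slicer, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor))
        else DatetimeCursor()
    )

assert isinstance(pick_client_side_cursor(GlobalSubstreamCursor()), GlobalSubstreamCursor)
assert isinstance(pick_client_side_cursor(object()), DatetimeCursor)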
@@ -1531,10 +1579,11 @@ class ModelToComponentFactory:
         cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None,
     ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
         if decoder:
-            if self._is_supported_decoder_for_pagination(decoder):
-                decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
-            else:
-                raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         else:
             decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
         page_size_option = (
@@ -1763,11 +1812,6 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})

-    @staticmethod
-    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
-        encoding = model.encoding if model.encoding else "utf-8"
-        return JsonParser(encoding=encoding)
-
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
@@ -1905,33 +1949,21 @@ class ModelToComponentFactory:
                 expires_in_name=InterpolatedString.create(
                     model.expires_in_name or "expires_in", parameters=model.parameters or {}
                 ).eval(config),
-                client_id_name=InterpolatedString.create(
-                    model.client_id_name or "client_id", parameters=model.parameters or {}
-                ).eval(config),
                 client_id=InterpolatedString.create(
                     model.client_id, parameters=model.parameters or {}
                 ).eval(config),
-                client_secret_name=InterpolatedString.create(
-                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
-                ).eval(config),
                 client_secret=InterpolatedString.create(
                     model.client_secret, parameters=model.parameters or {}
                 ).eval(config),
                 access_token_config_path=model.refresh_token_updater.access_token_config_path,
                 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
-                grant_type_name=InterpolatedString.create(
-                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
-                ).eval(config),
                 grant_type=InterpolatedString.create(
                     model.grant_type or "refresh_token", parameters=model.parameters or {}
                 ).eval(config),
                 refresh_request_body=InterpolatedMapping(
                     model.refresh_request_body or {}, parameters=model.parameters or {}
                 ).eval(config),
-                refresh_request_headers=InterpolatedMapping(
-                    model.refresh_request_headers or {}, parameters=model.parameters or {}
-                ).eval(config),
                 scopes=model.scopes,
                 token_expiry_date_format=model.token_expiry_date_format,
                 message_repository=self._message_repository,
@@ -1943,16 +1975,11 @@ class ModelToComponentFactory:
         return DeclarativeOauth2Authenticator(  # type: ignore
             access_token_name=model.access_token_name or "access_token",
             access_token_value=model.access_token_value,
-            client_id_name=model.client_id_name or "client_id",
             client_id=model.client_id,
-            client_secret_name=model.client_secret_name or "client_secret",
             client_secret=model.client_secret,
             expires_in_name=model.expires_in_name or "expires_in",
-            grant_type_name=model.grant_type_name or "grant_type",
             grant_type=model.grant_type or "refresh_token",
             refresh_request_body=model.refresh_request_body,
-            refresh_request_headers=model.refresh_request_headers,
             refresh_token_name=model.refresh_token_name or "refresh_token",
             refresh_token=model.refresh_token,
             scopes=model.scopes,
             token_expiry_date=model.token_expiry_date,
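Removing the *_name overrides and refresh_request_headers reverts the authenticator to the standard OAuth2 field names. For reference, a token-refresh payload built with those fixed defaults looks like the generic sketch below (illustrative values, not CDK code):

def build_refresh_request_body(client_id: str, client_secret: str, refresh_token: str) -> dict:
    # Standard OAuth2 refresh-token grant using the fixed field names the
    # reverted code falls back to ("grant_type", "client_id", ...).
    return {
        "grant_type": "refresh_token",
        "client_id": client_id,
        "client_secret": client_secret,
        "refresh_token": refresh_token,
    }

body = build_refresh_request_body("my-id", "my-secret", "my-refresh-token")
assert body["grant_type"] == "refresh_token"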
@@ -1964,22 +1991,22 @@ class ModelToComponentFactory:
             message_repository=self._message_repository,
         )

+    @staticmethod
     def create_offset_increment(
-        self, model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
+        model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> OffsetIncrement:
         if isinstance(decoder, PaginationDecoderDecorator):
-            inner_decoder = decoder.decoder
-        else:
-            inner_decoder = decoder
-            decoder = PaginationDecoderDecorator(decoder=decoder)
-
-        if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-            raise ValueError(
-                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
-            )
-
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         return OffsetIncrement(
             page_size=model.page_size,
             config=config,
@@ -2164,7 +2191,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        ):
+        ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
@@ -2324,7 +2351,7 @@ class ModelToComponentFactory:
             extractor=download_extractor,
             name=name,
             record_filter=None,
-            transformations=transformations,
+            transformations=[],
             schema_normalization=TypeTransformer(TransformConfig.NoTransform),
             config=config,
             parameters={},
@@ -2361,16 +2388,6 @@ class ModelToComponentFactory:
             if model.delete_requester
             else None
         )
-        url_requester = (
-            self._create_component_from_model(
-                model=model.url_requester,
-                decoder=decoder,
-                config=config,
-                name=f"job extract_url - {name}",
-            )
-            if model.url_requester
-            else None
-        )
         status_extractor = self._create_component_from_model(
             model=model.status_extractor, decoder=decoder, config=config, name=name
         )
@@ -2381,7 +2398,6 @@ class ModelToComponentFactory:
             creation_requester=creation_requester,
             polling_requester=polling_requester,
             download_retriever=download_retriever,
-            url_requester=url_requester,
             abort_requester=abort_requester,
             delete_requester=delete_requester,
             status_extractor=status_extractor,
@@ -2579,25 +2595,3 @@ class ModelToComponentFactory:
             components_mapping=components_mapping,
             parameters=model.parameters or {},
         )
-
-    _UNSUPPORTED_DECODER_ERROR = (
-        "Specified decoder of {decoder_type} is not supported for pagination."
-        "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead."
-        "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`."
-    )
-
-    def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool:
-        if isinstance(decoder, (JsonDecoder, XmlDecoder)):
-            return True
-        elif isinstance(decoder, CompositeRawDecoder):
-            return self._is_supported_parser_for_pagination(decoder.parser)
-        else:
-            return False
-
-    def _is_supported_parser_for_pagination(self, parser: Parser) -> bool:
-        if isinstance(parser, JsonParser):
-            return True
-        elif isinstance(parser, GzipParser):
-            return isinstance(parser.inner_parser, JsonParser)
-        else:
-            return False
airbyte_cdk/sources/declarative/requesters/http_job_repository.py
@@ -31,10 +31,6 @@ LOGGER = logging.getLogger("airbyte")

 @dataclass
 class AsyncHttpJobRepository(AsyncJobRepository):
-    """
-    See Readme file for more details about flow.
-    """
-
     creation_requester: Requester
     polling_requester: Requester
     download_retriever: SimpleRetriever
@@ -48,9 +44,6 @@ class AsyncHttpJobRepository(AsyncJobRepository):
     record_extractor: RecordExtractor = field(
         init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
     )
-    url_requester: Optional[Requester] = (
-        None  # use it in case polling_requester provides some <id> and extra request is needed to obtain list of urls to download from
-    )

     def __post_init__(self) -> None:
         self._create_job_response_by_id: Dict[str, Response] = {}
@@ -193,13 +186,10 @@ class AsyncHttpJobRepository(AsyncJobRepository):

         """

-        for url in self._get_download_url(job):
-            job_slice = job.job_parameters()
-            stream_slice = StreamSlice(
-                partition=job_slice.partition,
-                cursor_slice=job_slice.cursor_slice,
-                extra_fields={**job_slice.extra_fields, "url": url},
-            )
+        for url in self.urls_extractor.extract_records(
+            self._polling_job_response_by_id[job.api_job_id()]
+        ):
+            stream_slice: StreamSlice = StreamSlice(partition={"url": url}, cursor_slice={})
             for message in self.download_retriever.read_records({}, stream_slice):
                 if isinstance(message, Record):
                     yield message.data
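With url_requester gone, download URLs now come straight from the polling response via urls_extractor, and each URL becomes its own stream slice whose partition carries the URL. A compact sketch of that fan-out (StreamSlice below is a minimal stand-in for the CDK type):

from typing import Iterable, Iterator

class StreamSlice:
    def __init__(self, partition: dict, cursor_slice: dict) -> None:
        self.partition = partition
        self.cursor_slice = cursor_slice

def slices_for_urls(urls: Iterable[str]) -> Iterator[StreamSlice]:
    for url in urls:
        # One slice per extracted URL; the download retriever then reads each slice.
        yield StreamSlice(partition={"url": url}, cursor_slice={})

slices = list(slices_for_urls(["https://host/report/1", "https://host/report/2"]))
assert slices[0].partition["url"].endswith("/1")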
@@ -236,22 +226,3 @@ class AsyncHttpJobRepository(AsyncJobRepository):
             cursor_slice={},
         )
         return stream_slice
-
-    def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
-        if not self.url_requester:
-            url_response = self._polling_job_response_by_id[job.api_job_id()]
-        else:
-            stream_slice: StreamSlice = StreamSlice(
-                partition={
-                    "polling_job_response": self._polling_job_response_by_id[job.api_job_id()]
-                },
-                cursor_slice={},
-            )
-            url_response = self.url_requester.send_request(stream_slice=stream_slice)  # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
-            if not url_response:
-                raise AirbyteTracedException(
-                    internal_message="Always expect a response or an exception from url_requester",
-                    failure_type=FailureType.system_error,
-                )
-
-        yield from self.urls_extractor.extract_records(url_response)  # type: ignore # we expect urls_extractor to always return list of strings
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
             stream_slice,
             next_page_token,
             self._paginator.get_request_headers,
-            self.stream_slicer.get_request_headers,
+            self.request_option_provider.get_request_headers,
         )
         if isinstance(headers, str):
             raise ValueError("Request headers cannot be a string")
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
@@ -31,17 +31,6 @@ class DeliverRawFiles(BaseModel):

     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)

-    preserve_directory_structure: bool = Field(
-        title="Preserve Sub-Directories in File Paths",
-        description=(
-            "If enabled, sends subdirectory folder structure "
-            "along with source file names to the destination. "
-            "Otherwise, files will be synced by their names only. "
-            "This option is ignored when file-based replication is not enabled."
-        ),
-        default=True,
-    )
-

 class AbstractFileBasedSpec(BaseModel):
     """
airbyte_cdk/sources/file_based/exceptions.py
@@ -111,40 +111,6 @@ class ErrorListingFiles(BaseFileBasedSourceError):
     pass


-class DuplicatedFilesError(BaseFileBasedSourceError):
-    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
-        self._duplicated_files_names = duplicated_files_names
-        self._stream_name: str = kwargs["stream"]
-        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
-
-    def _format_duplicate_files_error_message(self) -> str:
-        duplicated_files_messages = []
-        for duplicated_file in self._duplicated_files_names:
-            for duplicated_file_name, file_paths in duplicated_file.items():
-                file_duplicated_message = (
-                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
-                    + "".join(f"\n - {file_paths}")
-                )
-                duplicated_files_messages.append(file_duplicated_message)
-
-        error_message = (
-            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
-            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
-            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
-            + "\n".join(duplicated_files_messages)
-        )
-
-        return error_message
-
-    def __repr__(self) -> str:
-        """Return a string representation of the exception."""
-        class_name = self.__class__.__name__
-        properties_str = ", ".join(
-            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
-        )
-        return f"{class_name}({properties_str})"
-
-
 class CustomFileBasedException(AirbyteTracedException):
     """
     A specialized exception for file-based connectors.
airbyte_cdk/sources/file_based/file_based_source.py
@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        parsed_config=parsed_config,
+                        use_file_transfer=self._use_file_transfer(parsed_config),
                     ),
                     source=self,
                     logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        parsed_config=parsed_config,
+                        use_file_transfer=self._use_file_transfer(parsed_config),
                     ),
                     source=self,
                     logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 stream = self._make_default_stream(
                     stream_config=stream_config,
                     cursor=cursor,
-                    parsed_config=parsed_config,
+                    use_file_transfer=self._use_file_transfer(parsed_config),
                 )

                 streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         self,
         stream_config: FileBasedStreamConfig,
         cursor: Optional[AbstractFileBasedCursor],
-        parsed_config: AbstractFileBasedSpec,
+        use_file_transfer: bool = False,
     ) -> AbstractFileBasedStream:
         return DefaultFileBasedStream(
             config=stream_config,
@@ -310,8 +310,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=self._use_file_transfer(parsed_config),
-            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
+            use_file_transfer=use_file_transfer,
         )

     def _get_stream_from_catalog(
@@ -386,25 +385,3 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             and parsed_config.delivery_method.delivery_type == "use_file_transfer"
         )
         return use_file_transfer
-
-    @staticmethod
-    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
-        """
-        Determines whether to preserve directory structure during file transfer.
-
-        When enabled, files maintain their subdirectory paths in the destination.
-        When disabled, files are flattened to the root of the destination.
-
-        Args:
-            parsed_config: The parsed configuration containing delivery method settings
-
-        Returns:
-            True if directory structure should be preserved (default), False otherwise
-        """
-        if (
-            FileBasedSource._use_file_transfer(parsed_config)
-            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
-            and parsed_config.delivery_method.preserve_directory_structure is not None
-        ):
-            return parsed_config.delivery_method.preserve_directory_structure
-        return True
airbyte_cdk/sources/file_based/file_based_stream_reader.py
@@ -135,17 +135,6 @@ class AbstractFileBasedStreamReader(ABC):
             return use_file_transfer
         return False

-    def preserve_directory_structure(self) -> bool:
-        # fall back to preserve subdirectories if config is not present or incomplete
-        if (
-            self.use_file_transfer()
-            and self.config
-            and hasattr(self.config.delivery_method, "preserve_directory_structure")
-            and self.config.delivery_method.preserve_directory_structure is not None
-        ):
-            return self.config.delivery_method.preserve_directory_structure
-        return True
-
     @abstractmethod
     def get_file(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -170,13 +159,10 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...

-    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
-        preserve_directory_structure = self.preserve_directory_structure()
-        if preserve_directory_structure:
-            # Remove left slashes from source path format to make relative path for writing locally
-            file_relative_path = file.uri.lstrip("/")
-        else:
-            file_relative_path = path.basename(file.uri)
+    @staticmethod
+    def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
+        # Remove left slashes from source path format to make relative path for writing locally
+        file_relative_path = file.uri.lstrip("/")
         local_file_path = path.join(local_directory, file_relative_path)

         # Ensure the local directory exists
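With the preserve_directory_structure branch removed, the reader always keeps the source's subdirectories: the URI minus its leading slashes is joined onto the local staging directory. The retained path logic, restated as a runnable sketch:

from os import makedirs, path

def get_file_transfer_paths(file_uri: str, local_directory: str) -> list:
    # Strip leading slashes so the source path becomes a relative path locally.
    file_relative_path = file_uri.lstrip("/")
    local_file_path = path.join(local_directory, file_relative_path)
    # Ensure the local directory exists, mirroring the step the hunk ends on.
    makedirs(path.dirname(local_file_path), exist_ok=True)
    return [file_relative_path, local_file_path]

paths = get_file_transfer_paths("/exports/2024/data.csv", "/tmp/stage")
assert paths == ["exports/2024/data.csv", "/tmp/stage/exports/2024/data.csv"]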
|