airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl
This diff compares two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/file_based/file_types/parquet_parser.py
CHANGED
@@ -10,9 +10,19 @@ from urllib.parse import unquote

 import pyarrow as pa
 import pyarrow.parquet as pq
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import
-
-
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    FileBasedStreamConfig,
+    ParquetFormat,
+)
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedSourceError,
+    RecordParseError,
+)
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
@@ -20,7 +30,6 @@ from pyarrow import DictionaryArray, Scalar


 class ParquetParser(FileTypeParser):
-
     ENCODING = None

     def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
@@ -45,9 +54,15 @@ class ParquetParser(FileTypeParser):
         parquet_schema = parquet_file.schema_arrow

         # Inferred non-partition schema
-        schema = {
+        schema = {
+            field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format)
+            for field in parquet_schema
+        }
         # Inferred partition schema
-        partition_columns = {
+        partition_columns = {
+            partition.split("=")[0]: {"type": "string"}
+            for partition in self._extract_partitions(file.uri)
+        }

         schema.update(partition_columns)
         return schema
@@ -69,21 +84,27 @@ class ParquetParser(FileTypeParser):
         try:
             with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                 reader = pq.ParquetFile(fp)
-                partition_columns = {
+                partition_columns = {
+                    x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)
+                }
                 for row_group in range(reader.num_row_groups):
                     batch = reader.read_row_group(row_group)
                     for row in range(batch.num_rows):
                         line_no += 1
                         yield {
                             **{
-                                column: ParquetParser._to_output_value(
+                                column: ParquetParser._to_output_value(
+                                    batch.column(column)[row], parquet_format
+                                )
                                 for column in batch.column_names
                             },
                             **partition_columns,
                         }
         except Exception as exc:
             raise RecordParseError(
-                FileBasedSourceError.ERROR_PARSING_RECORD,
+                FileBasedSourceError.ERROR_PARSING_RECORD,
+                filename=file.uri,
+                lineno=f"{row_group=}, {line_no=}",
             ) from exc

     @staticmethod
@@ -95,7 +116,9 @@ class ParquetParser(FileTypeParser):
         return FileReadMode.READ_BINARY

     @staticmethod
-    def _to_output_value(
+    def _to_output_value(
+        parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat
+    ) -> Any:
         """
         Convert an entry in a pyarrow table to a value that can be output by the source.
         """
@@ -113,7 +136,11 @@ class ParquetParser(FileTypeParser):
             return None

         # Convert date and datetime objects to isoformat strings
-        if
+        if (
+            pa.types.is_time(parquet_value.type)
+            or pa.types.is_timestamp(parquet_value.type)
+            or pa.types.is_date(parquet_value.type)
+        ):
             return parquet_value.as_py().isoformat()

         # Convert month_day_nano_interval to array
@@ -168,7 +195,9 @@ class ParquetParser(FileTypeParser):
         }

     @staticmethod
-    def parquet_type_to_schema_type(
+    def parquet_type_to_schema_type(
+        parquet_type: pa.DataType, parquet_format: ParquetFormat
+    ) -> Mapping[str, str]:
         """
         Convert a pyarrow data type to an Airbyte schema type.
         Parquet data types are defined at https://arrow.apache.org/docs/python/api/datatypes.html
@@ -198,7 +227,9 @@ class ParquetParser(FileTypeParser):
     @staticmethod
     def _is_binary(parquet_type: pa.DataType) -> bool:
         return bool(
-            pa.types.is_binary(parquet_type)
+            pa.types.is_binary(parquet_type)
+            or pa.types.is_large_binary(parquet_type)
+            or pa.types.is_fixed_size_binary(parquet_type)
         )

     @staticmethod
@@ -221,13 +252,23 @@ class ParquetParser(FileTypeParser):
             pa.types.is_time(parquet_type)
             or pa.types.is_string(parquet_type)
             or pa.types.is_large_string(parquet_type)
-            or ParquetParser._is_binary(
+            or ParquetParser._is_binary(
+                parquet_type
+            ) # Best we can do is return as a string since we do not support binary
         )

     @staticmethod
     def _is_object(parquet_type: pa.DataType) -> bool:
-        return bool(
+        return bool(
+            pa.types.is_dictionary(parquet_type)
+            or pa.types.is_struct(parquet_type)
+            or pa.types.is_map(parquet_type)
+        )

     @staticmethod
     def _is_list(parquet_type: pa.DataType) -> bool:
-        return bool(
+        return bool(
+            pa.types.is_list(parquet_type)
+            or pa.types.is_large_list(parquet_type)
+            or parquet_type == pa.month_day_nano_interval()
+        )
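For orientation on the partition handling these hunks re-wrap: the parser treats Hive-style `key=value` segments in a file's URI as extra string columns and merges them into both the inferred schema and every emitted record. A minimal, self-contained sketch of that idea; the helpers below are illustrative stand-ins, not the CDK's `_extract_partitions`:

```python
from typing import Dict, List


def extract_partitions(uri: str) -> List[str]:
    # Keep only path segments that look like Hive-style partitions, e.g. "year=2024".
    return [segment for segment in uri.split("/") if "=" in segment]


def partition_schema(uri: str) -> Dict[str, Dict[str, str]]:
    # Every partition column is exposed as a string field, mirroring the hunk above.
    return {seg.split("=")[0]: {"type": "string"} for seg in extract_partitions(uri)}


def partition_values(uri: str) -> Dict[str, str]:
    # key=value pairs that get merged into each record read from the file.
    return {seg.split("=")[0]: seg.split("=")[1] for seg in extract_partitions(uri)}


print(partition_schema("s3://bucket/year=2024/month=05/data.parquet"))
# {'year': {'type': 'string'}, 'month': {'type': 'string'}}
print(partition_values("s3://bucket/year=2024/month=05/data.parquet"))
# {'year': '2024', 'month': '05'}
```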
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
CHANGED
@@ -19,13 +19,21 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import (
     UnstructuredFormat,
 )
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 from airbyte_cdk.utils import is_cloud_environment
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
-from unstructured.file_utils.filetype import
+from unstructured.file_utils.filetype import (
+    FILETYPE_TO_MIMETYPE,
+    STR_TO_FILETYPE,
+    FileType,
+    detect_filetype,
+)

 unstructured_partition_pdf = None
 unstructured_partition_docx = None
@@ -109,7 +117,10 @@ class UnstructuredParser(FileTypeParser):
                     "type": "string",
                     "description": "Content of the file as markdown. Might be null if the file could not be parsed",
                 },
-                "document_key": {
+                "document_key": {
+                    "type": "string",
+                    "description": "Unique identifier of the document, e.g. the file path",
+                },
                 "_ab_source_file_parse_error": {
                     "type": "string",
                     "description": "Error message if the file could not be parsed even though the file is supported",
@@ -149,9 +160,19 @@ class UnstructuredParser(FileTypeParser):
            else:
                raise e

-    def _read_file(
+    def _read_file(
+        self,
+        file_handle: IOBase,
+        remote_file: RemoteFile,
+        format: UnstructuredFormat,
+        logger: logging.Logger,
+    ) -> str:
         _import_unstructured()
-        if (
+        if (
+            (not unstructured_partition_pdf)
+            or (not unstructured_partition_docx)
+            or (not unstructured_partition_pptx)
+        ):
            # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
            raise Exception("unstructured library is not available")

@@ -167,7 +188,9 @@ class UnstructuredParser(FileTypeParser):
             return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
         elif format.processing.mode == "api":
             try:
-                result: str = self._read_file_remotely_with_retries(
+                result: str = self._read_file_remotely_with_retries(
+                    file_handle, format.processing, filetype, format.strategy, remote_file
+                )
             except Exception as e:
                 # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
                 #
@@ -175,11 +198,15 @@ class UnstructuredParser(FileTypeParser):
                 # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
                 if isinstance(e, RecordParseError):
                     raise e
-                raise AirbyteTracedException.from_exception(
+                raise AirbyteTracedException.from_exception(
+                    e, failure_type=FailureType.config_error
+                )

             return result

-    def _params_to_dict(
+    def _params_to_dict(
+        self, params: Optional[List[APIParameterConfigModel]], strategy: str
+    ) -> Dict[str, Union[str, List[str]]]:
         result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
         if params is None:
             return result_dict
@@ -229,9 +256,16 @@ class UnstructuredParser(FileTypeParser):

         return True, None

-    @backoff.on_exception(
+    @backoff.on_exception(
+        backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
+    )
     def _read_file_remotely_with_retries(
-        self,
+        self,
+        file_handle: IOBase,
+        format: APIProcessingConfigModel,
+        filetype: FileType,
+        strategy: str,
+        remote_file: RemoteFile,
     ) -> str:
         """
         Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
@@ -239,7 +273,12 @@ class UnstructuredParser(FileTypeParser):
         return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)

     def _read_file_remotely(
-        self,
+        self,
+        file_handle: IOBase,
+        format: APIProcessingConfigModel,
+        filetype: FileType,
+        strategy: str,
+        remote_file: RemoteFile,
     ) -> str:
         headers = {"accept": "application/json", "unstructured-api-key": format.api_key}

@@ -247,7 +286,9 @@ class UnstructuredParser(FileTypeParser):

         file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}

-        response = requests.post(
+        response = requests.post(
+            f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
+        )

         if response.status_code == 422:
             # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
@@ -260,9 +301,15 @@ class UnstructuredParser(FileTypeParser):

         return self._render_markdown(json_response)

-    def _read_file_locally(
+    def _read_file_locally(
+        self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
+    ) -> str:
         _import_unstructured()
-        if (
+        if (
+            (not unstructured_partition_pdf)
+            or (not unstructured_partition_docx)
+            or (not unstructured_partition_pptx)
+        ):
            # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
            raise Exception("unstructured library is not available")

@@ -290,7 +337,9 @@ class UnstructuredParser(FileTypeParser):
         return self._render_markdown([element.to_dict() for element in elements])

     def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
-        return RecordParseError(
+        return RecordParseError(
+            FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
+        )

     def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
         """
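The `@backoff.on_exception(...)` decorator re-wrapped in hunk `@@ -229,9 +256,16 @@` retries the remote API call with exponential backoff, up to five attempts, and gives up early when the failure looks like a user error. A hedged, self-contained sketch of the same pattern; the `user_error` predicate and `post_with_retries` below are illustrative, not the CDK's code:

```python
import backoff
import requests


def user_error(exc: Exception) -> bool:
    # Do not retry 4xx responses: they indicate a problem with the request itself.
    return (
        isinstance(exc, requests.exceptions.HTTPError)
        and exc.response is not None
        and 400 <= exc.response.status_code < 500
    )


@backoff.on_exception(
    backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
)
def post_with_retries(url: str, **kwargs) -> requests.Response:
    response = requests.post(url, **kwargs)
    response.raise_for_status()  # raises HTTPError, which feeds the giveup predicate
    return response
```

Transient network failures and 5xx responses are retried with growing delays, while a 422-style client error short-circuits immediately because `giveup` returns True.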
airbyte_cdk/sources/file_based/schema_helpers.py
CHANGED
@@ -8,13 +8,20 @@ from enum import Enum
 from functools import total_ordering
 from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Type, Union

-from airbyte_cdk.sources.file_based.exceptions import
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedSourceError,
+    SchemaInferenceError,
+)

 JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
 SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]

 schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
-file_transfer_schema = {
+file_transfer_schema = {
+    "type": "object",
+    "properties": {"data": {"type": "object"}, "file": {"type": "object"}},
+}


 @total_ordering
@@ -129,7 +136,12 @@ def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -
             detected_types=f"{t1},{t2}",
         )
     # Schemas can still be merged if a key contains a null value in either t1 or t2, but it is still an object
-    elif (
+    elif (
+        (t1_type == "object" or t2_type == "object")
+        and t1_type != "null"
+        and t2_type != "null"
+        and t1 != t2
+    ):
         raise SchemaInferenceError(
             FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
             details="Cannot merge schema for unequal object types.",
@@ -137,12 +149,19 @@ def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -
             detected_types=f"{t1},{t2}",
         )
     else:
-        comparable_t1 = get_comparable_type(
-
+        comparable_t1 = get_comparable_type(
+            TYPE_PYTHON_MAPPING[t1_type][0]
+        ) # accessing the type_mapping value
+        comparable_t2 = get_comparable_type(
+            TYPE_PYTHON_MAPPING[t2_type][0]
+        ) # accessing the type_mapping value
         if not comparable_t1 and comparable_t2:
-            raise SchemaInferenceError(
+            raise SchemaInferenceError(
+                FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}"
+            )
         return max(
-            [t1, t2],
+            [t1, t2],
+            key=lambda x: ComparableType(get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0])),
         ) # accessing the type_mapping value


@@ -205,7 +224,8 @@ def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[M
             schema = input_schema
         if not all(isinstance(s, str) for s in schema.values()):
             raise ConfigValidationError(
-                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
+                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
+                details="Invalid input schema; nested schemas are not supported.",
             )

     except json.decoder.JSONDecodeError:
@@ -214,7 +234,9 @@ def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[M
     return schema


-def type_mapping_to_jsonschema(
+def type_mapping_to_jsonschema(
+    input_schema: Optional[Union[str, Mapping[str, str]]],
+) -> Optional[Mapping[str, Any]]:
     """
     Return the user input schema (type mapping), transformed to JSON Schema format.

@@ -241,7 +263,8 @@ def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, st

         if not _json_schema_type:
             raise ConfigValidationError(
-                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
+                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
+                details=f"Invalid type '{type_name}' for property '{col_name}'.",
             )

         json_schema_type = _json_schema_type[0]
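The `_choose_wider_type` hunks above only re-wrap the comparison logic; the underlying idea is to merge two inferred column types by picking whichever can represent both. A simplified sketch of that widening, assuming a fixed ordering of JSON Schema types; the real module drives this from `TYPE_PYTHON_MAPPING` and `ComparableType` rather than the toy table below:

```python
from typing import Dict

# Illustrative widening order, narrowest to widest (an assumption for this sketch).
WIDTH = {"null": 0, "boolean": 1, "integer": 2, "number": 3, "string": 4}


def choose_wider_type(t1: Dict[str, str], t2: Dict[str, str]) -> Dict[str, str]:
    # Pick whichever of the two single-field types can represent both values.
    return max([t1, t2], key=lambda t: WIDTH[t["type"]])


print(choose_wider_type({"type": "integer"}, {"type": "number"}))  # {'type': 'number'}
print(choose_wider_type({"type": "boolean"}, {"type": "string"}))  # {'type': 'string'}
```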
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py
CHANGED
@@ -11,7 +11,9 @@ class AbstractSchemaValidationPolicy(ABC):
     validate_schema_before_sync = False # Whether to verify that records conform to the schema during the stream's availabilty check

     @abstractmethod
-    def record_passes_validation_policy(
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         """
         Return True if the record passes the user's validation policy.
         """
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py
CHANGED
@@ -5,7 +5,10 @@
 from typing import Any, Mapping, Optional

 from airbyte_cdk.sources.file_based.config.file_based_stream_config import ValidationPolicy
-from airbyte_cdk.sources.file_based.exceptions import
+from airbyte_cdk.sources.file_based.exceptions import (
+    FileBasedSourceError,
+    StopSyncPerValidationPolicy,
+)
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
 from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy

@@ -13,14 +16,18 @@ from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSc
 class EmitRecordPolicy(AbstractSchemaValidationPolicy):
     name = "emit_record"

-    def record_passes_validation_policy(
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         return True


 class SkipRecordPolicy(AbstractSchemaValidationPolicy):
     name = "skip_record"

-    def record_passes_validation_policy(
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         return schema is not None and conforms_to_schema(record, schema)


@@ -28,9 +35,13 @@ class WaitForDiscoverPolicy(AbstractSchemaValidationPolicy):
     name = "wait_for_discover"
     validate_schema_before_sync = True

-    def record_passes_validation_policy(
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         if schema is None or not conforms_to_schema(record, schema):
-            raise StopSyncPerValidationPolicy(
+            raise StopSyncPerValidationPolicy(
+                FileBasedSourceError.STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY
+            )
         return True

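Taken together, the three policies reformatted above decide what happens to a record that does not match the discovered schema: emit it anyway, skip it, or stop the sync. A rough usage sketch with a simplified `conforms_to_schema` stand-in (the real check lives in `schema_helpers.py`):

```python
from typing import Any, Mapping, Optional


def conforms_to_schema(record: Mapping[str, Any], schema: Mapping[str, Any]) -> bool:
    # Simplified stand-in: only checks that every record key is declared in the schema.
    return set(record) <= set(schema.get("properties", {}))


def skip_record_policy(record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
    # Mirrors SkipRecordPolicy: drop the record unless it conforms.
    return schema is not None and conforms_to_schema(record, schema)


schema = {"properties": {"id": {"type": "integer"}}}
print(skip_record_policy({"id": 1}, schema))                # True  -> record is emitted
print(skip_record_policy({"id": 1, "extra": "x"}, schema))  # False -> record is skipped
```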
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py
CHANGED
@@ -8,10 +8,20 @@ from typing import Any, Dict, Iterable, List, Mapping, Optional, Type

 from airbyte_cdk import AirbyteMessage
 from airbyte_cdk.models import SyncMode
-from airbyte_cdk.sources.file_based.availability_strategy import
-
+from airbyte_cdk.sources.file_based.availability_strategy import (
+    AbstractFileBasedAvailabilityStrategy,
+)
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    FileBasedStreamConfig,
+    PrimaryKeyType,
+)
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import
+from airbyte_cdk.sources.file_based.exceptions import (
+    FileBasedErrorsCollector,
+    FileBasedSourceError,
+    RecordParseError,
+    UndefinedParserError,
+)
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -64,8 +74,7 @@ class AbstractFileBasedStream(Stream):

     @property
     @abstractmethod
-    def primary_key(self) -> PrimaryKeyType:
-        ...
+    def primary_key(self) -> PrimaryKeyType: ...

     @cache
     def list_files(self) -> List[RemoteFile]:
@@ -102,14 +111,20 @@ class AbstractFileBasedStream(Stream):
         return self.read_records_from_slice(stream_slice)

     @abstractmethod
-    def read_records_from_slice(
+    def read_records_from_slice(
+        self, stream_slice: StreamSlice
+    ) -> Iterable[Mapping[str, Any] | AirbyteMessage]:
         """
         Yield all records from all remote files in `list_files_for_this_sync`.
         """
         ...

     def stream_slices(
-        self,
+        self,
+        *,
+        sync_mode: SyncMode,
+        cursor_field: Optional[List[str]] = None,
+        stream_state: Optional[Mapping[str, Any]] = None,
     ) -> Iterable[Optional[Mapping[str, Any]]]:
         """
         This method acts as an adapter between the generic Stream interface and the file-based's
@@ -144,14 +159,22 @@ class AbstractFileBasedStream(Stream):
         try:
             return self._parsers[type(self.config.format)]
         except KeyError:
-            raise UndefinedParserError(
+            raise UndefinedParserError(
+                FileBasedSourceError.UNDEFINED_PARSER,
+                stream=self.name,
+                format=type(self.config.format),
+            )

     def record_passes_validation_policy(self, record: Mapping[str, Any]) -> bool:
         if self.validation_policy:
-            return self.validation_policy.record_passes_validation_policy(
+            return self.validation_policy.record_passes_validation_policy(
+                record=record, schema=self.catalog_schema
+            )
         else:
             raise RecordParseError(
-                FileBasedSourceError.UNDEFINED_VALIDATION_POLICY,
+                FileBasedSourceError.UNDEFINED_VALIDATION_POLICY,
+                stream=self.name,
+                validation_policy=self.config.validation_policy,
             )

     @cached_property
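One detail visible in hunk `@@ -102,14 +111,20 @@`: the wrapped `stream_slices` signature takes its arguments keyword-only (the bare `*`), so callers must pass `sync_mode`, `cursor_field` and `stream_state` by name. A small, generic illustration of what that signature shape means for callers; the toy function below is not the CDK class:

```python
from typing import Any, Iterable, List, Mapping, Optional


def stream_slices(
    *,
    sync_mode: str,
    cursor_field: Optional[List[str]] = None,
    stream_state: Optional[Mapping[str, Any]] = None,
) -> Iterable[Optional[Mapping[str, Any]]]:
    # Toy generator standing in for the file-based slicing logic.
    yield {"files": [], "sync_mode": sync_mode, "state": stream_state or {}}


# Arguments after the bare "*" must be passed by keyword:
print(list(stream_slices(sync_mode="incremental", stream_state={"cursor": "2024-01-01"})))
# stream_slices("incremental")  # would raise TypeError: takes 0 positional arguments
```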
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py
CHANGED
@@ -7,7 +7,14 @@ import logging
 from functools import cache, lru_cache
 from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union

-from airbyte_cdk.models import
+from airbyte_cdk.models import (
+    AirbyteLogMessage,
+    AirbyteMessage,
+    ConfiguredAirbyteStream,
+    Level,
+    SyncMode,
+    Type,
+)
 from airbyte_cdk.sources import AbstractSource
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.availability_strategy import (
@@ -26,7 +33,10 @@ from airbyte_cdk.sources.source import ExperimentalClassWarning
 from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage
-from airbyte_cdk.sources.streams.concurrent.helpers import
+from airbyte_cdk.sources.streams.concurrent.helpers import (
+    get_cursor_field_from_stream,
+    get_primary_key_from_stream,
+)
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
@@ -36,7 +46,9 @@ from airbyte_cdk.sources.utils.slice_logger import SliceLogger
 from deprecated.classic import deprecated

 if TYPE_CHECKING:
-    from airbyte_cdk.sources.file_based.stream.concurrent.cursor import
+    from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
+        AbstractConcurrentFileBasedCursor,
+    )

 """
 This module contains adapters to help enabling concurrency on File-based Stream objects without needing to migrate to AbstractStream
@@ -72,7 +84,9 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
                 partition_generator=FileBasedStreamPartitionGenerator(
                     stream,
                     message_repository,
-                    SyncMode.full_refresh
+                    SyncMode.full_refresh
+                    if isinstance(cursor, FileBasedFinalStateCursor)
+                    else SyncMode.incremental,
                     [cursor_field] if cursor_field is not None else None,
                     state,
                     cursor,
@@ -138,7 +152,10 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas

     @property
     def primary_key(self) -> PrimaryKeyType:
-        return
+        return (
+            self._legacy_stream.config.primary_key
+            or self.get_parser().get_parser_defined_primary_key(self._legacy_stream.config)
+        )

     def get_parser(self) -> FileTypeParser:
         return self._legacy_stream.get_parser()
@@ -185,7 +202,10 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
             # This shouldn't happen if the ConcurrentCursor was used
             state = "unknown; no state attribute was available on the cursor"
             yield AirbyteMessage(
-                type=Type.LOG,
+                type=Type.LOG,
+                log=AirbyteLogMessage(
+                    level=Level.ERROR, message=f"Cursor State at time of exception: {state}"
+                ),
             )
             raise exc

@@ -227,16 +247,30 @@ class FileBasedStreamPartition(Partition):
            ):
                if isinstance(record_data, Mapping):
                    data_to_return = dict(record_data)
-                    self._stream.transformer.transform(
+                    self._stream.transformer.transform(
+                        data_to_return, self._stream.get_json_schema()
+                    )
                    yield Record(data_to_return, self)
-                elif
+                elif (
+                    isinstance(record_data, AirbyteMessage)
+                    and record_data.type == Type.RECORD
+                    and record_data.record is not None
+                ):
                    # `AirbyteMessage`s of type `Record` should also be yielded so they are enqueued
                    # If stream is flagged for file_transfer the record should data in file key
-                    record_message_data =
+                    record_message_data = (
+                        record_data.record.file
+                        if self._use_file_transfer()
+                        else record_data.record.data
+                    )
                    if not record_message_data:
                        raise ExceptionWithDisplayMessage("A record without data was found")
                    else:
-                        yield Record(
+                        yield Record(
+                            data=record_message_data,
+                            partition=self,
+                            is_file_transfer_message=self._use_file_transfer(),
+                        )
                else:
                    self._message_repository.emit_message(record_data)
        except Exception as e:
@@ -305,7 +339,9 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):

     def generate(self) -> Iterable[FileBasedStreamPartition]:
         pending_partitions = []
-        for _slice in self._stream.stream_slices(
+        for _slice in self._stream.stream_slices(
+            sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state
+        ):
             if _slice is not None:
                 for file in _slice.get("files", []):
                     pending_partitions.append(
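The `FileBasedStreamPartition.read` hunk above chooses which payload ends up in the emitted `Record`: for file-transfer streams the `file` part of the message is used, otherwise the regular `data` part. A stripped-down sketch of that selection using plain dataclasses; this is illustrative only, the real code works on `AirbyteMessage` objects:

```python
from dataclasses import dataclass
from typing import Any, Mapping, Optional


@dataclass
class RecordMessage:
    data: Optional[Mapping[str, Any]] = None
    file: Optional[Mapping[str, Any]] = None


def select_payload(record: RecordMessage, use_file_transfer: bool) -> Mapping[str, Any]:
    # Mirrors the reformatted conditional: file payload for file-transfer streams, data otherwise.
    payload = record.file if use_file_transfer else record.data
    if not payload:
        raise ValueError("A record without data was found")
    return payload


msg = RecordMessage(data={"id": 1}, file={"file_url": "/tmp/x.csv", "bytes": 1024})
print(select_payload(msg, use_file_transfer=False))  # {'id': 1}
print(select_payload(msg, use_file_transfer=True))   # {'file_url': '/tmp/x.csv', 'bytes': 1024}
```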