airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/file_based/file_based_stream_reader.py

```diff
@@ -45,7 +45,9 @@ class AbstractFileBasedStreamReader(ABC):
         ...
 
     @abstractmethod
-    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
+    def open_file(
+        self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
+    ) -> IOBase:
         """
         Return a file handle for reading.
 
@@ -80,11 +82,17 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...
 
-    def filter_files_by_globs_and_start_date(self, files: List[RemoteFile], globs: List[str]) -> Iterable[RemoteFile]:
+    def filter_files_by_globs_and_start_date(
+        self, files: List[RemoteFile], globs: List[str]
+    ) -> Iterable[RemoteFile]:
         """
         Utility method for filtering files based on globs.
         """
-        start_date = datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) if self.config and self.config.start_date else None
+        start_date = (
+            datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
+            if self.config and self.config.start_date
+            else None
+        )
         seen = set()
 
         for file in files:
@@ -120,13 +128,16 @@ class AbstractFileBasedStreamReader(ABC):
     def use_file_transfer(self) -> bool:
         if self.config:
             use_file_transfer = (
-                hasattr(self.config.delivery_method, "delivery_type") and self.config.delivery_method.delivery_type == "use_file_transfer"
+                hasattr(self.config.delivery_method, "delivery_type")
+                and self.config.delivery_method.delivery_type == "use_file_transfer"
             )
             return use_file_transfer
         return False
 
     @abstractmethod
-    def get_file(self, file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, Any]:
+    def get_file(
+        self, file: RemoteFile, local_directory: str, logger: logging.Logger
+    ) -> Dict[str, Any]:
         """
         This is required for connectors that will support writing to
         files. It will handle the logic to download,get,read,acquire or
```
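Every hunk in this file is behavior-preserving line wrapping: joining the `+` lines of each block reproduces the single `-` line. For instance, the `start_date` assignment is just the parenthesized form of a conditional expression. A minimal standalone sketch of the same selection logic (the `DATE_TIME_FORMAT` value and the `config` shape here are illustrative assumptions, not the CDK's definitions):

```python
from datetime import datetime
from typing import Any, Optional

DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"  # assumed format, for illustration only

def parse_start_date(config: Optional[Any]) -> Optional[datetime]:
    # Mirrors the wrapped conditional in the hunk: parse config.start_date when
    # both the config and the field are set, otherwise fall back to None.
    return (
        datetime.strptime(config.start_date, DATE_TIME_FORMAT)
        if config and config.start_date
        else None
    )
```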
airbyte_cdk/sources/file_based/file_types/avro_parser.py

```diff
@@ -9,7 +9,10 @@ import fastavro
 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
@@ -64,15 +67,21 @@ class AvroParser(FileTypeParser):
             avro_schema = avro_reader.writer_schema
         if not avro_schema["type"] == "record":
             unsupported_type = avro_schema["type"]
-            raise ValueError(f"Only record based avro files are supported. Found {unsupported_type}")
+            raise ValueError(
+                f"Only record based avro files are supported. Found {unsupported_type}"
+            )
         json_schema = {
-            field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"])
+            field["name"]: AvroParser._convert_avro_type_to_json(
+                avro_format, field["name"], field["type"]
+            )
             for field in avro_schema["fields"]
         }
         return json_schema
 
     @classmethod
-    def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]:
+    def _convert_avro_type_to_json(
+        cls, avro_format: AvroFormat, field_name: str, avro_field: str
+    ) -> Mapping[str, Any]:
         if isinstance(avro_field, str) and avro_field in AVRO_TYPE_TO_JSON_TYPE:
             # Legacy behavior to retain backwards compatibility. Long term we should always represent doubles as strings
             if avro_field == "double" and not avro_format.double_as_string:
@@ -83,17 +92,28 @@ class AvroParser(FileTypeParser):
                 return {
                     "type": "object",
                     "properties": {
-                        object_field["name"]: AvroParser._convert_avro_type_to_json(avro_format, object_field["name"], object_field["type"])
+                        object_field["name"]: AvroParser._convert_avro_type_to_json(
+                            avro_format, object_field["name"], object_field["type"]
+                        )
                         for object_field in avro_field["fields"]
                     },
                 }
             elif avro_field["type"] == "array":
                 if "items" not in avro_field:
-                    raise ValueError(f"{field_name} array type does not have a required field items")
-                return {"type": "array", "items": AvroParser._convert_avro_type_to_json(avro_format, "", avro_field["items"])}
+                    raise ValueError(
+                        f"{field_name} array type does not have a required field items"
+                    )
+                return {
+                    "type": "array",
+                    "items": AvroParser._convert_avro_type_to_json(
+                        avro_format, "", avro_field["items"]
+                    ),
+                }
             elif avro_field["type"] == "enum":
                 if "symbols" not in avro_field:
-                    raise ValueError(f"{field_name} enum type does not have a required field symbols")
+                    raise ValueError(
+                        f"{field_name} enum type does not have a required field symbols"
+                    )
                 if "name" not in avro_field:
                     raise ValueError(f"{field_name} enum type does not have a required field name")
                 return {"type": "string", "enum": avro_field["symbols"]}
@@ -102,7 +122,9 @@ class AvroParser(FileTypeParser):
                     raise ValueError(f"{field_name} map type does not have a required field values")
                 return {
                     "type": "object",
-                    "additionalProperties": AvroParser._convert_avro_type_to_json(avro_format, "", avro_field["values"]),
+                    "additionalProperties": AvroParser._convert_avro_type_to_json(
+                        avro_format, "", avro_field["values"]
+                    ),
                 }
             elif avro_field["type"] == "fixed" and avro_field.get("logicalType") != "duration":
                 if "size" not in avro_field:
@@ -115,18 +137,27 @@ class AvroParser(FileTypeParser):
                 }
             elif avro_field.get("logicalType") == "decimal":
                 if "precision" not in avro_field:
-                    raise ValueError(f"{field_name} decimal type does not have a required field precision")
+                    raise ValueError(
+                        f"{field_name} decimal type does not have a required field precision"
+                    )
                 if "scale" not in avro_field:
-                    raise ValueError(f"{field_name} decimal type does not have a required field scale")
+                    raise ValueError(
+                        f"{field_name} decimal type does not have a required field scale"
+                    )
                 max_whole_number_range = avro_field["precision"] - avro_field["scale"]
                 decimal_range = avro_field["scale"]
 
                 # This regex looks like a mess, but it is validation for at least one whole number and optional fractional numbers
                 # For example: ^-?\d{1,5}(?:\.\d{1,3})?$ would accept 12345.123 and 123456.12345 would be rejected
-                return {"type": "string", "pattern": f"^-?\\d{{{1,max_whole_number_range}}}(?:\\.\\d{1,decimal_range})?$"}
+                return {
+                    "type": "string",
+                    "pattern": f"^-?\\d{{{1,max_whole_number_range}}}(?:\\.\\d{1,decimal_range})?$",
+                }
             elif "logicalType" in avro_field:
                 if avro_field["logicalType"] not in AVRO_LOGICAL_TYPE_TO_JSON:
-                    raise ValueError(f"{avro_field['logicalType']} is not a valid Avro logical type")
+                    raise ValueError(
+                        f"{avro_field['logicalType']} is not a valid Avro logical type"
+                    )
                 return AVRO_LOGICAL_TYPE_TO_JSON[avro_field["logicalType"]]
             else:
                 raise ValueError(f"Unsupported avro type: {avro_field}")
@@ -150,22 +181,32 @@ class AvroParser(FileTypeParser):
             with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                 avro_reader = fastavro.reader(fp)
                 schema = avro_reader.writer_schema
-                schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
+                schema_field_name_to_type = {
+                    field["name"]: field["type"] for field in schema["fields"]
+                }
                 for record in avro_reader:
                     line_no += 1
                     yield {
-                        record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
+                        record_field: self._to_output_value(
+                            avro_format,
+                            schema_field_name_to_type[record_field],
+                            record[record_field],
+                        )
                         for record_field, record_value in schema_field_name_to_type.items()
                     }
         except Exception as exc:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
+            ) from exc
 
     @property
     def file_read_mode(self) -> FileReadMode:
         return FileReadMode.READ_BINARY
 
     @staticmethod
-    def _to_output_value(avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any) -> Any:
+    def _to_output_value(
+        avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any
+    ) -> Any:
         if isinstance(record_value, bytes):
             return record_value.decode()
         elif not isinstance(record_type, Mapping):
```
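The `AvroParser` hunks reflow the recursive `_convert_avro_type_to_json` without touching its mapping: `record` becomes `object`, `array` becomes `array` with converted `items`, `enum` becomes a `string` carrying the `enum` symbols, and `map` becomes an `object` with `additionalProperties`. A simplified standalone sketch of that recursion (not the CDK implementation; the primitive table and error handling are reduced for brevity):

```python
from typing import Any, Mapping

# Reduced primitive mapping for illustration; the CDK's table is larger.
AVRO_PRIMITIVES = {"string": "string", "int": "integer", "long": "integer",
                   "float": "number", "double": "number", "boolean": "boolean"}

def avro_to_json_schema(avro_field: Any) -> Mapping[str, Any]:
    # Recursively translate an Avro field definition into a JSON-schema fragment.
    if isinstance(avro_field, str):
        return {"type": AVRO_PRIMITIVES[avro_field]}
    if avro_field["type"] == "record":
        return {
            "type": "object",
            "properties": {
                f["name"]: avro_to_json_schema(f["type"]) for f in avro_field["fields"]
            },
        }
    if avro_field["type"] == "array":
        return {"type": "array", "items": avro_to_json_schema(avro_field["items"])}
    if avro_field["type"] == "enum":
        return {"type": "string", "enum": avro_field["symbols"]}
    if avro_field["type"] == "map":
        return {
            "type": "object",
            "additionalProperties": avro_to_json_schema(avro_field["values"]),
        }
    raise ValueError(f"Unsupported avro type: {avro_field}")
```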
airbyte_cdk/sources/file_based/file_types/csv_parser.py

```diff
@@ -13,10 +13,18 @@ from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Opti
 from uuid import uuid4
 
 from airbyte_cdk.models import FailureType
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
@@ -77,7 +85,9 @@ class _CsvReader:
                     # than headers or more headers dans columns
                     if None in row:
                         if config_format.ignore_errors_on_fields_mismatch:
-                            logger.error(f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column.")
+                            logger.error(
+                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column."
+                            )
                         else:
                             raise RecordParseError(
                                 FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_COLUMNS,
@@ -86,10 +96,14 @@ class _CsvReader:
                             )
                     if None in row.values():
                         if config_format.ignore_errors_on_fields_mismatch:
-                            logger.error(f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column.")
+                            logger.error(
+                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column."
+                            )
                         else:
                             raise RecordParseError(
-                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS, filename=file.uri, lineno=lineno
+                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS,
+                                filename=file.uri,
+                                lineno=lineno,
                             )
                     yield row
             finally:
@@ -105,7 +119,9 @@ class _CsvReader:
             return config_format.header_definition.column_names  # type: ignore # should be CsvHeaderUserProvided given the type
 
         if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
-            self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header)
+            self._skip_rows(
+                fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header
+            )
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
@@ -165,11 +181,15 @@ class CsvParser(FileTypeParser):
         # sources will likely require one. Rather than modify the interface now we can wait until the real use case
         config_format = _extract_format(config)
         type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict(
-            lambda: _JsonTypeInferrer(config_format.true_values, config_format.false_values, config_format.null_values)
+            lambda: _JsonTypeInferrer(
+                config_format.true_values, config_format.false_values, config_format.null_values
+            )
             if config_format.inference_type != InferenceType.NONE
             else _DisabledTypeInferrer()
         )
-        data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
+        data_generator = self._csv_reader.read_data(
+            config, file, stream_reader, logger, self.file_read_mode
+        )
         read_bytes = 0
         for row in data_generator:
             for header, value in row.items():
@@ -187,7 +207,10 @@ class CsvParser(FileTypeParser):
                 f"Else, please contact Airbyte.",
                 failure_type=FailureType.config_error,
             )
-        schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()}
+        schema = {
+            header.strip(): {"type": type_inferred.infer()}
+            for header, type_inferred in type_inferrer_by_field.items()
+        }
         data_generator.close()
         return schema
 
@@ -203,19 +226,30 @@ class CsvParser(FileTypeParser):
         try:
             config_format = _extract_format(config)
             if discovered_schema:
-                property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
+                property_types = {
+                    col: prop["type"] for col, prop in discovered_schema["properties"].items()
+                }  # type: ignore # discovered_schema["properties"] is known to be a mapping
                 deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
             else:
                 deduped_property_types = {}
-            cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
-            data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
+            cast_fn = CsvParser._get_cast_function(
+                deduped_property_types, config_format, logger, config.schemaless
+            )
+            data_generator = self._csv_reader.read_data(
+                config, file, stream_reader, logger, self.file_read_mode
+            )
             for row in data_generator:
                 line_no += 1
                 yield CsvParser._to_nullable(
-                    cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
+                    cast_fn(row),
+                    deduped_property_types,
+                    config_format.null_values,
+                    config_format.strings_can_be_null,
                 )
         except RecordParseError as parse_err:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
+            ) from parse_err
         finally:
             data_generator.close()
 
@@ -225,27 +259,47 @@ class CsvParser(FileTypeParser):
 
     @staticmethod
     def _get_cast_function(
-        deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger, schemaless: bool
+        deduped_property_types: Mapping[str, str],
+        config_format: CsvFormat,
+        logger: logging.Logger,
+        schemaless: bool,
    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
         # Only cast values if the schema is provided
         if deduped_property_types and not schemaless:
-            return partial(CsvParser._cast_types, deduped_property_types=deduped_property_types, config_format=config_format, logger=logger)
+            return partial(
+                CsvParser._cast_types,
+                deduped_property_types=deduped_property_types,
+                config_format=config_format,
+                logger=logger,
+            )
         else:
             # If no schema is provided, yield the rows as they are
             return _no_cast
 
     @staticmethod
     def _to_nullable(
-        row: Mapping[str, str], deduped_property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool
+        row: Mapping[str, str],
+        deduped_property_types: Mapping[str, str],
+        null_values: Set[str],
+        strings_can_be_null: bool,
     ) -> Dict[str, Optional[str]]:
         nullable = {
-            k: None if CsvParser._value_is_none(v, deduped_property_types.get(k), null_values, strings_can_be_null) else v
+            k: None
+            if CsvParser._value_is_none(
+                v, deduped_property_types.get(k), null_values, strings_can_be_null
+            )
+            else v
             for k, v in row.items()
         }
         return nullable
 
     @staticmethod
-    def _value_is_none(value: Any, deduped_property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool:
+    def _value_is_none(
+        value: Any,
+        deduped_property_type: Optional[str],
+        null_values: Set[str],
+        strings_can_be_null: bool,
+    ) -> bool:
         return value in null_values and (strings_can_be_null or deduped_property_type != "string")
 
     @staticmethod
@@ -280,7 +334,10 @@ class CsvParser(FileTypeParser):
 
     @staticmethod
     def _cast_types(
-        row: Dict[str, str], deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger
+        row: Dict[str, str],
+        deduped_property_types: Mapping[str, str],
+        config_format: CsvFormat,
+        logger: logging.Logger,
     ) -> Dict[str, Any]:
         """
         Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.
@@ -305,20 +362,22 @@ class CsvParser(FileTypeParser):
                 else:
                     warnings.append(_format_warning(key, value, prop_type))
 
-            elif python_type == bool:
+            elif python_type is bool:
                 try:
-                    cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values)
+                    cast_value = _value_to_bool(
+                        value, config_format.true_values, config_format.false_values
+                    )
                 except ValueError:
                     warnings.append(_format_warning(key, value, prop_type))
 
-            elif python_type == dict:
+            elif python_type is dict:
                 try:
                     # we don't re-use _value_to_object here because we type the column as object as long as there is only one object
                     cast_value = orjson.loads(value)
                 except orjson.JSONDecodeError:
                     warnings.append(_format_warning(key, value, prop_type))
 
-            elif python_type == list:
+            elif python_type is list:
                 try:
                     cast_value = _value_to_list(value)
                 except (ValueError, json.JSONDecodeError):
@@ -364,7 +423,9 @@ class _JsonTypeInferrer(_TypeInferrer):
     _NUMBER_TYPE = "number"
     _STRING_TYPE = "string"
 
-    def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]) -> None:
+    def __init__(
+        self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]
+    ) -> None:
         self._boolean_trues = boolean_trues
         self._boolean_falses = boolean_falses
         self._null_values = null_values
@@ -375,7 +436,9 @@ class _JsonTypeInferrer(_TypeInferrer):
 
     def infer(self) -> str:
         types_by_value = {value: self._infer_type(value) for value in self._values}
-        types_excluding_null_values = [types for types in types_by_value.values() if self._NULL_TYPE not in types]
+        types_excluding_null_values = [
+            types for types in types_by_value.values() if self._NULL_TYPE not in types
+        ]
         if not types_excluding_null_values:
             # this is highly unusual but we will consider the column as a string
             return self._STRING_TYPE
```
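The only semantic change in the `CsvParser` hunks is `==` to `is` when comparing against type objects (equivalent here, since built-in types are singletons); everything else is wrapping. The per-cell nullability rule that `_to_nullable` applies is unchanged; a standalone sketch of that predicate, with usage:

```python
from typing import Optional, Set

def value_is_none(value: str, prop_type: Optional[str],
                  null_values: Set[str], strings_can_be_null: bool) -> bool:
    # A cell becomes None when it matches a configured null marker, unless the
    # column is typed "string" and strings are not allowed to be null.
    return value in null_values and (strings_can_be_null or prop_type != "string")

print(value_is_none("NULL", "integer", {"NULL", ""}, False))  # True
print(value_is_none("NULL", "string", {"NULL", ""}, False))   # False
```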
airbyte_cdk/sources/file_based/file_types/excel_parser.py

```diff
@@ -8,9 +8,19 @@ from pathlib import Path
 from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
 
 import pandas as pd
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    ExcelFormat,
+    FileBasedStreamConfig,
+)
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedSourceError,
+    RecordParseError,
+)
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
@@ -63,7 +73,11 @@ class ExcelParser(FileTypeParser):
             fields[column] = self.dtype_to_json_type(prev_frame_column_type, df_type)
 
         schema = {
-            field: ({"type": "string", "format": "date-time"} if fields[field] == "date-time" else {"type": fields[field]})
+            field: (
+                {"type": "string", "format": "date-time"}
+                if fields[field] == "date-time"
+                else {"type": fields[field]}
+            )
             for field in fields
         }
         return schema
@@ -101,11 +115,15 @@ class ExcelParser(FileTypeParser):
             # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
             # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
             # see PR description: https://github.com/airbytehq/airbyte/pull/44444/
-            yield from orjson.loads(df.to_json(orient="records", date_format="iso", date_unit="us"))
+            yield from orjson.loads(
+                df.to_json(orient="records", date_format="iso", date_unit="us")
+            )
 
         except Exception as exc:
             # Raise a RecordParseError if any exception occurs during parsing
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) from exc
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri
+            ) from exc
 
     @property
     def file_read_mode(self) -> FileReadMode:
@@ -133,7 +151,7 @@ class ExcelParser(FileTypeParser):
         if current_type == "string":
             # Previous column values were of the string type, no need to look further.
             return current_type
-        if dtype == object:
+        if dtype is object:
             return "string"
         if dtype in number_types and (not current_type or current_type == "number"):
             return "number"
```
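The `ExcelParser` schema hunk only adds parentheses around the conditional; the rule stays: a column inferred as `date-time` maps to a string with a `date-time` format annotation, anything else keeps its inferred type. A standalone sketch of that comprehension, with an example result:

```python
from typing import Any, Dict

def fields_to_json_schema(fields: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
    # Mirrors the reflowed comprehension: date-time columns become annotated
    # strings; other columns carry their inferred JSON type through unchanged.
    return {
        field: (
            {"type": "string", "format": "date-time"}
            if fields[field] == "date-time"
            else {"type": fields[field]}
        )
        for field in fields
    }

# fields_to_json_schema({"created": "date-time", "count": "number"})
# -> {"created": {"type": "string", "format": "date-time"}, "count": {"type": "number"}}
```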
airbyte_cdk/sources/file_based/file_types/file_transfer.py

```diff
@@ -15,7 +15,11 @@ DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
 
 class FileTransfer:
     def __init__(self) -> None:
-        self._local_directory = AIRBYTE_STAGING_DIRECTORY if os.path.exists(AIRBYTE_STAGING_DIRECTORY) else DEFAULT_LOCAL_DIRECTORY
+        self._local_directory = (
+            AIRBYTE_STAGING_DIRECTORY
+            if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
+            else DEFAULT_LOCAL_DIRECTORY
+        )
 
     def get_file(
         self,
@@ -25,7 +29,9 @@ class FileTransfer:
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
         try:
-            yield stream_reader.get_file(file=file, local_directory=self._local_directory, logger=logger)
+            yield stream_reader.get_file(
+                file=file, local_directory=self._local_directory, logger=logger
+            )
         except Exception as ex:
             logger.error("An error has occurred while getting file: %s", str(ex))
             raise ex
```
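`FileTransfer.__init__` is likewise unchanged in behavior: prefer the staging directory when it exists on disk, otherwise fall back to `/tmp/airbyte-file-transfer`. A standalone sketch of that selection (the staging path value here is an assumption for illustration, not the CDK's definition):

```python
import os

AIRBYTE_STAGING_DIRECTORY = "/staging/files"  # assumed value, for illustration only
DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"

def pick_local_directory() -> str:
    # Prefer the platform staging directory when it is mounted; else fall back.
    return (
        AIRBYTE_STAGING_DIRECTORY
        if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
        else DEFAULT_LOCAL_DIRECTORY
    )
```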
airbyte_cdk/sources/file_based/file_types/file_type_parser.py

```diff
@@ -7,7 +7,10 @@ from abc import ABC, abstractmethod
 from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 
```
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py

```diff
@@ -8,15 +8,21 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas
+from airbyte_cdk.sources.file_based.schema_helpers import (
+    PYTHON_TYPE_MAPPING,
+    SchemaType,
+    merge_schemas,
+)
 from orjson import orjson
 
 
 class JsonlParser(FileTypeParser):
-
     MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
     ENCODING = "utf8"
 
@@ -103,7 +109,9 @@ class JsonlParser(FileTypeParser):
             try:
                 record = orjson.loads(accumulator)
                 if had_json_parsing_error and not has_warned_for_multiline_json_object:
-                    logger.warning(f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced")
+                    logger.warning(
+                        f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
+                    )
                     has_warned_for_multiline_json_object = True
 
                 yield record
@@ -112,7 +120,11 @@ class JsonlParser(FileTypeParser):
             except orjson.JSONDecodeError:
                 had_json_parsing_error = True
 
-            if read_limit and yielded_at_least_once and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
+            if (
+                read_limit
+                and yielded_at_least_once
+                and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
+            ):
                 logger.warning(
                     f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
                     f"Inferring schema from an incomplete set of records."
@@ -120,7 +132,9 @@ class JsonlParser(FileTypeParser):
                 break
 
         if had_json_parsing_error and not yielded_at_least_once:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line
+            )
 
     @staticmethod
     def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
```