airbyte-cdk 0.58.4__py3-none-any.whl → 0.58.7__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (23)
  1. airbyte_cdk/entrypoint.py +5 -3
  2. airbyte_cdk/sources/file_based/config/avro_format.py +1 -0
  3. airbyte_cdk/sources/file_based/config/parquet_format.py +1 -0
  4. airbyte_cdk/sources/file_based/exceptions.py +26 -1
  5. airbyte_cdk/sources/file_based/file_based_source.py +5 -1
  6. airbyte_cdk/sources/file_based/file_types/avro_parser.py +15 -9
  7. airbyte_cdk/sources/file_based/file_types/csv_parser.py +19 -11
  8. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
  9. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +22 -14
  10. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -1
  11. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +8 -6
  12. {airbyte_cdk-0.58.4.dist-info → airbyte_cdk-0.58.7.dist-info}/METADATA +1 -1
  13. {airbyte_cdk-0.58.4.dist-info → airbyte_cdk-0.58.7.dist-info}/RECORD +23 -23
  14. unit_tests/connector_builder/test_connector_builder_handler.py +3 -3
  15. unit_tests/sources/file_based/scenarios/csv_scenarios.py +110 -9
  16. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +4 -0
  17. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +16 -31
  18. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +87 -10
  19. unit_tests/sources/file_based/test_file_based_scenarios.py +2 -0
  20. unit_tests/sources/test_integration_source.py +4 -3
  21. {airbyte_cdk-0.58.4.dist-info → airbyte_cdk-0.58.7.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-0.58.4.dist-info → airbyte_cdk-0.58.7.dist-info}/WHEEL +0 -0
  23. {airbyte_cdk-0.58.4.dist-info → airbyte_cdk-0.58.7.dist-info}/top_level.txt +0 -0
airbyte_cdk/entrypoint.py CHANGED
@@ -26,6 +26,7 @@ from airbyte_cdk.utils import is_cloud_environment
26
26
  from airbyte_cdk.utils.airbyte_secrets_utils import get_secrets, update_secrets
27
27
  from airbyte_cdk.utils.constants import ENV_REQUEST_CACHE_PATH
28
28
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
29
+ from airbyte_protocol.models import FailureType
29
30
  from requests import PreparedRequest, Response, Session
30
31
 
31
32
  logger = init_logger("airbyte")
@@ -236,9 +237,10 @@ def _init_internal_request_filter() -> None:
236
237
  try:
237
238
  is_private = _is_private_url(parsed_url.hostname, parsed_url.port) # type: ignore [arg-type]
238
239
  if is_private:
239
- raise ValueError(
240
- "Invalid URL endpoint: The endpoint that data is being requested from belongs to a private network. Source "
241
- + "connectors only support requesting data from public API endpoints."
240
+ raise AirbyteTracedException(
241
+ internal_message=f"Invalid URL endpoint: `{parsed_url.hostname!r}` belongs to a private network",
242
+ failure_type=FailureType.config_error,
243
+ message="Invalid URL endpoint: The endpoint that data is being requested from belongs to a private network. Source connectors only support requesting data from public API endpoints.",
242
244
  )
243
245
  except socket.gaierror as exception:
244
246
  # This is a special case where the developer specifies an IP address string that is not formatted correctly like trailing
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+
5
6
  from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
6
7
  from pydantic import BaseModel, Field
7
8
 
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+
5
6
  from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
6
7
  from pydantic import BaseModel, Field
7
8
 
@@ -3,8 +3,9 @@
3
3
  #
4
4
 
5
5
  from enum import Enum
6
- from typing import Union
6
+ from typing import Any, List, Union
7
7
 
8
+ from airbyte_cdk.models import AirbyteMessage, FailureType
8
9
  from airbyte_cdk.utils import AirbyteTracedException
9
10
 
10
11
 
@@ -40,6 +41,30 @@ class FileBasedSourceError(Enum):
40
41
  UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
41
42
 
42
43
 
44
+ class FileBasedErrorsCollector:
45
+ """
46
+ The placeholder for all errors collected.
47
+ """
48
+
49
+ errors: List[AirbyteMessage] = []
50
+
51
+ def yield_and_raise_collected(self) -> Any:
52
+ if self.errors:
53
+ # emit collected logged messages
54
+ yield from self.errors
55
+ # clean the collector
56
+ self.errors.clear()
57
+ # raising the single exception
58
+ raise AirbyteTracedException(
59
+ internal_message="Please check the logged errors for more information.",
60
+ message="Some errors occured while reading from the source.",
61
+ failure_type=FailureType.config_error,
62
+ )
63
+
64
+ def collect(self, logged_error: AirbyteMessage) -> None:
65
+ self.errors.append(logged_error)
66
+
67
+
43
68
  class BaseFileBasedSourceError(Exception):
44
69
  def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
45
70
  if isinstance(error, FileBasedSourceError):
@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBas
14
14
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
15
15
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
16
16
  from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
17
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
17
+ from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
18
18
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
19
19
  from airbyte_cdk.sources.file_based.file_types import default_parsers
20
20
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
@@ -49,6 +49,7 @@ class FileBasedSource(AbstractSource, ABC):
49
49
  self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
50
50
  self.cursor_cls = cursor_cls
51
51
  self.logger = logging.getLogger(f"airbyte.{self.name}")
52
+ self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
52
53
 
53
54
  def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
54
55
  """
@@ -106,6 +107,7 @@ class FileBasedSource(AbstractSource, ABC):
106
107
  parsers=self.parsers,
107
108
  validation_policy=self._validate_and_get_validation_policy(stream_config),
108
109
  cursor=self.cursor_cls(stream_config),
110
+ errors_collector=self.errors_collector,
109
111
  )
110
112
  )
111
113
  return streams
@@ -121,6 +123,8 @@ class FileBasedSource(AbstractSource, ABC):
121
123
  state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
122
124
  ) -> Iterator[AirbyteMessage]:
123
125
  yield from super().read(logger, config, catalog, state)
126
+ # emit all the errors collected
127
+ yield from self.errors_collector.yield_and_raise_collected()
124
128
  # count streams using a certain parser
125
129
  parsed_config = self._get_parsed_config(config)
126
130
  for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
8
8
  import fastavro
9
9
  from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
10
10
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
11
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
11
12
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
12
13
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
13
14
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -144,15 +145,20 @@ class AvroParser(FileTypeParser):
144
145
  if not isinstance(avro_format, AvroFormat):
145
146
  raise ValueError(f"Expected ParquetFormat, got {avro_format}")
146
147
 
147
- with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
148
- avro_reader = fastavro.reader(fp)
149
- schema = avro_reader.writer_schema
150
- schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
151
- for record in avro_reader:
152
- yield {
153
- record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
154
- for record_field, record_value in schema_field_name_to_type.items()
155
- }
148
+ line_no = 0
149
+ try:
150
+ with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
151
+ avro_reader = fastavro.reader(fp)
152
+ schema = avro_reader.writer_schema
153
+ schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
154
+ for record in avro_reader:
155
+ line_no += 1
156
+ yield {
157
+ record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
158
+ for record_field, record_value in schema_field_name_to_type.items()
159
+ }
160
+ except Exception as exc:
161
+ raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
156
162
 
157
163
  @property
158
164
  def file_read_mode(self) -> FileReadMode:
@@ -178,17 +178,25 @@ class CsvParser(FileTypeParser):
178
178
  logger: logging.Logger,
179
179
  discovered_schema: Optional[Mapping[str, SchemaType]],
180
180
  ) -> Iterable[Dict[str, Any]]:
181
- config_format = _extract_format(config)
182
- if discovered_schema:
183
- property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping
184
- deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
185
- else:
186
- deduped_property_types = {}
187
- cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
188
- data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
189
- for row in data_generator:
190
- yield CsvParser._to_nullable(cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null)
191
- data_generator.close()
181
+ line_no = 0
182
+ try:
183
+ config_format = _extract_format(config)
184
+ if discovered_schema:
185
+ property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping
186
+ deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
187
+ else:
188
+ deduped_property_types = {}
189
+ cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
190
+ data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
191
+ for row in data_generator:
192
+ line_no += 1
193
+ yield CsvParser._to_nullable(
194
+ cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
195
+ )
196
+ except RecordParseError as parse_err:
197
+ raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
198
+ finally:
199
+ data_generator.close()
192
200
 
193
201
  @property
194
202
  def file_read_mode(self) -> FileReadMode:
@@ -119,7 +119,7 @@ class JsonlParser(FileTypeParser):
119
119
  break
120
120
 
121
121
  if had_json_parsing_error and not yielded_at_least_once:
122
- raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
122
+ raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
123
123
 
124
124
  @staticmethod
125
125
  def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
@@ -11,7 +11,7 @@ from urllib.parse import unquote
11
11
  import pyarrow as pa
12
12
  import pyarrow.parquet as pq
13
13
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
14
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
14
+ from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
15
15
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
16
16
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
17
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -64,19 +64,27 @@ class ParquetParser(FileTypeParser):
64
64
  if not isinstance(parquet_format, ParquetFormat):
65
65
  logger.info(f"Expected ParquetFormat, got {parquet_format}")
66
66
  raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
67
- with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
68
- reader = pq.ParquetFile(fp)
69
- partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
70
- for row_group in range(reader.num_row_groups):
71
- batch = reader.read_row_group(row_group)
72
- for row in range(batch.num_rows):
73
- yield {
74
- **{
75
- column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
76
- for column in batch.column_names
77
- },
78
- **partition_columns,
79
- }
67
+
68
+ line_no = 0
69
+ try:
70
+ with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
71
+ reader = pq.ParquetFile(fp)
72
+ partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
73
+ for row_group in range(reader.num_row_groups):
74
+ batch = reader.read_row_group(row_group)
75
+ for row in range(batch.num_rows):
76
+ line_no += 1
77
+ yield {
78
+ **{
79
+ column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
80
+ for column in batch.column_names
81
+ },
82
+ **partition_columns,
83
+ }
84
+ except Exception as exc:
85
+ raise RecordParseError(
86
+ FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
87
+ ) from exc
80
88
 
81
89
  @staticmethod
82
90
  def _extract_partitions(filepath: str) -> List[str]:
@@ -10,7 +10,7 @@ from airbyte_cdk.models import SyncMode
10
10
  from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
11
11
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
12
12
  from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
13
- from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError, UndefinedParserError
13
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
14
14
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
15
15
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
16
16
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -44,6 +44,7 @@ class AbstractFileBasedStream(Stream):
44
44
  discovery_policy: AbstractDiscoveryPolicy,
45
45
  parsers: Dict[Type[Any], FileTypeParser],
46
46
  validation_policy: AbstractSchemaValidationPolicy,
47
+ errors_collector: FileBasedErrorsCollector,
47
48
  ):
48
49
  super().__init__()
49
50
  self.config = config
@@ -53,6 +54,7 @@ class AbstractFileBasedStream(Stream):
53
54
  self._discovery_policy = discovery_policy
54
55
  self._availability_strategy = availability_strategy
55
56
  self._parsers = parsers
57
+ self.errors_collector = errors_collector
56
58
 
57
59
  @property
58
60
  @abstractmethod
@@ -112,12 +112,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
112
112
  except RecordParseError:
113
113
  # Increment line_no because the exception was raised before we could increment it
114
114
  line_no += 1
115
- yield AirbyteMessage(
116
- type=MessageType.LOG,
117
- log=AirbyteLogMessage(
118
- level=Level.ERROR,
119
- message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
120
- stack_trace=traceback.format_exc(),
115
+ self.errors_collector.collect(
116
+ AirbyteMessage(
117
+ type=MessageType.LOG,
118
+ log=AirbyteLogMessage(
119
+ level=Level.ERROR,
120
+ message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
121
+ stack_trace=traceback.format_exc(),
122
+ ),
121
123
  ),
122
124
  )
123
125
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.58.4
3
+ Version: 0.58.7
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -1,7 +1,7 @@
1
1
  airbyte_cdk/__init__.py,sha256=OBQWv5rF_QTRpOiP6J8J8oTU-GGrfi18i1PRFpahKks,262
2
2
  airbyte_cdk/config_observation.py,sha256=3kjxv8xTwCnub2_fTWnMPRx0E7vly1BUeyXOSK15Ql4,3610
3
3
  airbyte_cdk/connector.py,sha256=LtTAmBFV1LBUz_fOEbQ_EvBhyUsz8AGOlDsvK8QOOo0,4396
4
- airbyte_cdk/entrypoint.py,sha256=uX3MawH1qukzxFjdR1AFynG0l5vLof9X40m_AYTrP_8,13180
4
+ airbyte_cdk/entrypoint.py,sha256=pJ8Wxw3Om7w4oA8KpytamcY_OOseB4NgoblB-GwS0KQ,13401
5
5
  airbyte_cdk/exception_handler.py,sha256=Xa8rpWRB_JBMMdqwKhQGYLekuq5BpYot_Lwde4B7r4E,1485
6
6
  airbyte_cdk/logger.py,sha256=4Mi2MEQi1uh59BP9Dxw_UEbZuxaJewqK_jvEU2b10nk,3985
7
7
  airbyte_cdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -152,8 +152,8 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
152
152
  airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
153
153
  airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
154
154
  airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- airbyte_cdk/sources/file_based/exceptions.py,sha256=z9JBEEhGyM1ev9P6MjjjigtyuBm3OaOl3lIhkOQf8lQ,4765
156
- airbyte_cdk/sources/file_based/file_based_source.py,sha256=2kguVKlTFg9vSE-eNZeVj4-VXElz3OuhJZrWrIdp2HE,7896
155
+ airbyte_cdk/sources/file_based/exceptions.py,sha256=-SjdDk-mbkp5qQVUESkn788W8NmGtC2LROkZRKS_Dxc,5613
156
+ airbyte_cdk/sources/file_based/file_based_source.py,sha256=XddFHSiL_a-VOfQF33yXVapUG6wvHu2hd9xxYEoBcuc,8180
157
157
  airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
158
158
  airbyte_cdk/sources/file_based/remote_file.py,sha256=dtRX7X06Fug-XDz93a5lkwPQy5nQgxH0-ZcXW2HuMGI,312
159
159
  airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
@@ -163,28 +163,28 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
163
163
  airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=TXo8kdwOQ3XiQbS3ccPtj9FghHFpiVL2JRWjen3NRXw,5289
164
164
  airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
165
  airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=dgOoQuoi7-7wdTMSP7wz4ENXIDT49Ew4FoAxnnplGGc,4956
166
- airbyte_cdk/sources/file_based/config/avro_format.py,sha256=lQSEq5JZY0M5y9mW93R4EjrIb8brYXUgrXCY-6EMHww,711
166
+ airbyte_cdk/sources/file_based/config/avro_format.py,sha256=q1I2G9bGNy3ADds35PfWT7Mss6fjYzUtYDkUYvh5v7s,712
167
167
  airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
168
168
  airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=l9DFyttYbxY9exwy67WzRXySEk_yKV2G_THRA_Sq1I4,4229
169
169
  airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
170
- airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=8GTDTQyvS7pWLVG0LWirHVE1snHd0Au5R4Ym33-ezEg,736
170
+ airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=yKHgXYu3zJWrGfBlJ3JQZ3gVFPumF-K4rjVPNoYTUZ0,737
171
171
  airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=axuIc4xaac7vTJKi8I9l7Lgn8gGu6bCuZNouQAEAvYs,3513
172
172
  airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
173
173
  airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
174
174
  airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
175
175
  airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=hWSDNKCIqvi7gOyfZJezuKt6-JtVroerUVTvW3ZY-R4,1017
176
- airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=gCyPJc4khkar4sdfBd-RU3CuV_k7nnsNM080tjwDOiw,8817
177
- airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3fKsZbFQSbo83EGNSuqZpqjTg_c,17880
176
+ airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=FC3L6D32SzhAv4jyS5dnsIgYmvGHgbomJpI2xRWrbZ0,9167
177
+ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=HqglbQ7vopMbEWC5F-PuB-4ycXgDHLHxq1sN6IGPUpE,18215
178
178
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
179
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
180
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
179
+ airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=MkjK_J2OqzqRPyGeQFQFADxgwqsRaNtoawB7dwKxWb0,5666
180
+ airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=8ZuuYnS2AzlJ-IaeBP6Pnjzu4Z2zzfBWw_x9Rt9a5Qs,9363
181
181
  airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=omYdo6daIHI-YWF9WsKFdFHRXTFWgJjJ3OqegiN345k,16736
182
182
  airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
183
183
  airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
184
184
  airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
185
185
  airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
186
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=0Z343kciFr5Y66SiwzIcxT6eKG2rMQtLHgLX-vpUVa4,6278
187
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=VBVZeHt4MP_PWHE_Z6rQatiOUWu-HIRoqo2EcmvV_6E,12463
186
+ airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=GJu02B2_fcfaOnSBvhpRyXIEEtu4v8ubFR_vQpe-YAU,6405
187
+ airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=Ou8vKnaR7yhEG9NoOLfwOHqxYBro055nB9CCBPC2I2s,12555
188
188
  airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
189
189
  airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
190
190
  airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
@@ -265,7 +265,7 @@ airbyte_cdk/utils/traced_exception.py,sha256=ChtuhSV_fkmMv8QjPBR1dV1US8uxlmVt_My
265
265
  source_declarative_manifest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
266
266
  source_declarative_manifest/main.py,sha256=HXzuRsRyhHwPrGU-hc4S7RrgoOoHImqkdfbmO2geBeE,1027
267
267
  unit_tests/connector_builder/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
268
- unit_tests/connector_builder/test_connector_builder_handler.py,sha256=V4bocoh7XOgBoc4cqPz665QtzTbhmyRcItdBIXUITuc,33788
268
+ unit_tests/connector_builder/test_connector_builder_handler.py,sha256=UJbz5luKnbczdRM7yaIXnNIF64_Ajl7KgFeHSW32nc0,33824
269
269
  unit_tests/connector_builder/test_message_grouper.py,sha256=qfXXrpz043VbrTNV1tWqhIp8BP08HOF7XuGoMuWvWkc,30305
270
270
  unit_tests/connector_builder/utils.py,sha256=AAggdGWP-mNuWOZUHLAVIbjTeIcdPo-3pbMm5zdYpS0,796
271
271
  unit_tests/destinations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -279,7 +279,7 @@ unit_tests/sources/test_concurrent_source.py,sha256=NT4K0z-oz2OZBHE9xNQT0KUdI2wJ
279
279
  unit_tests/sources/test_config.py,sha256=lxjeaf48pOMF4Pf3-Z1ux_tHTyjRFCdG_hpnxw3e7uQ,2839
280
280
  unit_tests/sources/test_connector_state_manager.py,sha256=ynFxA63Cxe6t-wMMh9C6ByTlMAuk8W7H2FikDhnUEQ0,24264
281
281
  unit_tests/sources/test_http_logger.py,sha256=VT6DqgspI3DcRnoBQkkQX0z4dF_AOiYZ5P_zxmMW8oU,9004
282
- unit_tests/sources/test_integration_source.py,sha256=7DAWzuYwU_HXzhw-rRjjwQuQej-hVpNyzw_NLqQiJVc,3369
282
+ unit_tests/sources/test_integration_source.py,sha256=u_w5NS9n8GkTsoTjJvBE3-g8x0NG2054hL3PtW7IfAM,3458
283
283
  unit_tests/sources/test_source.py,sha256=W0I4umL_d_OToLYYiRkjkJR6e-cCYjdV8zKc3uLvF0k,27999
284
284
  unit_tests/sources/test_source_read.py,sha256=AEFoJfzM0_5QQIJyKwGLK_kq_Vz_CBivImnUnXJQJ0I,17176
285
285
  unit_tests/sources/concurrent_source/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
@@ -363,7 +363,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
363
363
  unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
364
364
  unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
365
365
  unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
366
- unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=iwXCqnFGxfHh3l48wXtlD-x74rbZYcA94XXnBrcrrKQ,11616
366
+ unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=rQaORUdsRdWxTMMshJxAxnp3x6Bsnuirit4yjrT0Oao,11680
367
367
  unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
368
368
  unit_tests/sources/file_based/test_scenarios.py,sha256=ONBUwnX_dWOaejKiuJQgMRWgr_0NCWJpTwf4nvw_ePg,8008
369
369
  unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
@@ -384,18 +384,18 @@ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=kmVl
384
384
  unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
385
385
  unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
386
386
  unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
387
- unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4mh9nllsdsi4NCOr8q0ZRZatFUz3Zf5etVcwVE_mjbM,120032
387
+ unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4RYb8_C7sJM_6pI-cftP5fCk0i6dr4BW1lpY5iQDdN8,123795
388
388
  unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
389
389
  unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
390
390
  unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
391
391
  unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
392
392
  unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=zSZtaYUflkosflxQGrTDxiJ24mhFsTJYosKyxAHgWbM,9475
393
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=04exiS9j6kPyHLUUMgQLGfJHmlD1T63bixANhnUDdzk,67762
393
+ unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=2_p15Phk2xiBgZ0OdGYrCU9eAlTT8h_SU5nk1ehUcLk,67894
394
394
  unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=FVYbRfdj2RCLFVwUNqQKiBFMm78y6FvmTO447i3SXqY,28697
395
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=Try0knJN5wfoGNO38QGoLGIcqSceSAQsUWO42CusNYI,33005
395
+ unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=fFcNR-lzfLQ5SS8Uetbx6iFijgs_OXlqYz3Pr1OVTAI,32221
396
396
  unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
397
397
  unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=XhtCGvgSBFyeQwgqGciPsIB1HIlWqTcXROwnxrjutHc,13109
398
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=3zupeqwYyAb2EP4jn_8zbdu6_gTa1HlOAu6Rh0lxStM,7786
398
+ unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=1GZPMIL00KGMIaYcPPBhQ0gpkYAJ48xtxXOgEwxkg84,10263
399
399
  unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
400
400
  unit_tests/sources/fixtures/source_test_fixture.py,sha256=dvpISgio2sOp-U3bXudH_49vY4c68sO_PMs1JZTMaj0,5502
401
401
  unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -444,8 +444,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
444
444
  unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
445
445
  unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
446
446
  unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
447
- airbyte_cdk-0.58.4.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
448
- airbyte_cdk-0.58.4.dist-info/METADATA,sha256=TOtOUaJWhywPsGHa6p3FX_F7z5ehHq5EIj5QtVI5d_E,11073
449
- airbyte_cdk-0.58.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
450
- airbyte_cdk-0.58.4.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
451
- airbyte_cdk-0.58.4.dist-info/RECORD,,
447
+ airbyte_cdk-0.58.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
448
+ airbyte_cdk-0.58.7.dist-info/METADATA,sha256=PAw5bOce761Nqfm14qOw6Rk60S5JqPvK9d8oUnzv8vc,11073
449
+ airbyte_cdk-0.58.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
450
+ airbyte_cdk-0.58.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
451
+ airbyte_cdk-0.58.7.dist-info/RECORD,,
@@ -769,8 +769,8 @@ def test_read_source_single_page_single_slice(mock_http_stream):
769
769
  "deployment_mode, url_base, expected_error",
770
770
  [
771
771
  pytest.param("CLOUD", "https://airbyte.com/api/v1/characters", None, id="test_cloud_read_with_public_endpoint"),
772
- pytest.param("CLOUD", "https://10.0.27.27", "ValueError", id="test_cloud_read_with_private_endpoint"),
773
- pytest.param("CLOUD", "https://localhost:80/api/v1/cast", "ValueError", id="test_cloud_read_with_localhost"),
772
+ pytest.param("CLOUD", "https://10.0.27.27", "AirbyteTracedException", id="test_cloud_read_with_private_endpoint"),
773
+ pytest.param("CLOUD", "https://localhost:80/api/v1/cast", "AirbyteTracedException", id="test_cloud_read_with_localhost"),
774
774
  pytest.param("CLOUD", "http://unsecured.protocol/api/v1", "InvalidSchema", id="test_cloud_read_with_unsecured_endpoint"),
775
775
  pytest.param("CLOUD", "https://domainwithoutextension", "Invalid URL", id="test_cloud_read_with_invalid_url_endpoint"),
776
776
  pytest.param("OSS", "https://airbyte.com/api/v1/", None, id="test_oss_read_with_public_endpoint"),
@@ -820,7 +820,7 @@ def test_handle_read_external_requests(deployment_mode, url_base, expected_error
820
820
  "deployment_mode, token_url, expected_error",
821
821
  [
822
822
  pytest.param("CLOUD", "https://airbyte.com/tokens/bearer", None, id="test_cloud_read_with_public_endpoint"),
823
- pytest.param("CLOUD", "https://10.0.27.27/tokens/bearer", "ValueError", id="test_cloud_read_with_private_endpoint"),
823
+ pytest.param("CLOUD", "https://10.0.27.27/tokens/bearer", "AirbyteTracedException", id="test_cloud_read_with_private_endpoint"),
824
824
  pytest.param("CLOUD", "http://unsecured.protocol/tokens/bearer", "InvalidSchema", id="test_cloud_read_with_unsecured_endpoint"),
825
825
  pytest.param("CLOUD", "https://domainwithoutextension", "Invalid URL", id="test_cloud_read_with_invalid_url_endpoint"),
826
826
  pytest.param("OSS", "https://airbyte.com/tokens/bearer", None, id="test_oss_read_with_public_endpoint"),
@@ -852,6 +852,109 @@ invalid_csv_scenario: TestScenario[InMemoryFilesSource] = (
852
852
  ]
853
853
  }
854
854
  )
855
+ .set_expected_read_error(
856
+ AirbyteTracedException,
857
+ "Please check the logged errors for more information.",
858
+ )
859
+ ).build()
860
+
861
+ invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
862
+ TestScenarioBuilder[InMemoryFilesSource]()
863
+ .set_name("invalid_csv_multi_scenario") # too many values for the number of headers
864
+ .set_config(
865
+ {
866
+ "streams": [
867
+ {
868
+ "name": "stream1",
869
+ "format": {"filetype": "csv"},
870
+ "globs": ["*"],
871
+ "validation_policy": "Emit Record",
872
+ },
873
+ {
874
+ "name": "stream2",
875
+ "format": {"filetype": "csv"},
876
+ "globs": ["b.csv"],
877
+ "validation_policy": "Emit Record",
878
+ },
879
+ ]
880
+ }
881
+ )
882
+ .set_source_builder(
883
+ FileBasedSourceBuilder()
884
+ .set_files(
885
+ {
886
+ "a.csv": {
887
+ "contents": [
888
+ ("col1",),
889
+ ("val11", "val12"),
890
+ ("val21", "val22"),
891
+ ],
892
+ "last_modified": "2023-06-05T03:54:07.000Z",
893
+ },
894
+ "b.csv": {
895
+ "contents": [
896
+ ("col3",),
897
+ ("val13b", "val14b"),
898
+ ("val23b", "val24b"),
899
+ ],
900
+ "last_modified": "2023-06-05T03:54:07.000Z",
901
+ },
902
+ }
903
+ )
904
+ .set_file_type("csv")
905
+ )
906
+ .set_expected_catalog(
907
+ {
908
+ "streams": [
909
+ {
910
+ "default_cursor_field": ["_ab_source_file_last_modified"],
911
+ "json_schema": {
912
+ "type": "object",
913
+ "properties": {
914
+ "col1": {"type": ["null", "string"]},
915
+ "col2": {"type": ["null", "string"]},
916
+ "_ab_source_file_last_modified": {"type": "string"},
917
+ "_ab_source_file_url": {"type": "string"},
918
+ },
919
+ },
920
+ "name": "stream1",
921
+ "source_defined_cursor": True,
922
+ "supported_sync_modes": ["full_refresh", "incremental"],
923
+ },
924
+ {
925
+ "json_schema": {
926
+ "type": "object",
927
+ "properties": {
928
+ "col3": {"type": ["null", "string"]},
929
+ "_ab_source_file_last_modified": {"type": "string"},
930
+ "_ab_source_file_url": {"type": "string"},
931
+ },
932
+ },
933
+ "name": "stream2",
934
+ "source_defined_cursor": True,
935
+ "default_cursor_field": ["_ab_source_file_last_modified"],
936
+ "supported_sync_modes": ["full_refresh", "incremental"],
937
+ },
938
+ ]
939
+ }
940
+ )
941
+ .set_expected_records([])
942
+ .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
943
+ .set_expected_logs(
944
+ {
945
+ "read": [
946
+ {
947
+ "level": "ERROR",
948
+ "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
949
+ },
950
+ {
951
+ "level": "ERROR",
952
+ "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
953
+ },
954
+ ]
955
+ }
956
+ )
957
+ .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
855
958
  ).build()
856
959
 
857
960
  csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
@@ -2172,17 +2275,15 @@ csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
2172
2275
  },
2173
2276
  ]
2174
2277
  )
2175
- .set_expected_logs(
2176
- {
2177
- "read": [
2178
- {
2179
- "level": "ERROR",
2180
- "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0",
2181
- }
2182
- ]
2183
- }
2278
+ .set_expected_read_error(
2279
+ AirbyteTracedException,
2280
+ f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
2184
2281
  )
2185
2282
  .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
2283
+ .set_expected_read_error(
2284
+ AirbyteTracedException,
2285
+ "Please check the logged errors for more information.",
2286
+ )
2186
2287
  ).build()
2187
2288
 
2188
2289
  csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (
@@ -231,6 +231,10 @@ unstructured_invalid_file_type_discover_scenario_no_skip = (
231
231
  )
232
232
  .set_expected_records([])
233
233
  .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
234
+ .set_expected_read_error(
235
+ AirbyteTracedException,
236
+ "Please check the logged errors for more information.",
237
+ )
234
238
  ).build()
235
239
 
236
240
  # If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
@@ -2,7 +2,8 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
5
+
6
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
6
7
  from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
7
8
  from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
8
9
 
@@ -272,6 +273,10 @@ skip_record_scenario_single_stream = (
272
273
  ]
273
274
  }
274
275
  )
276
+ .set_expected_read_error(
277
+ AirbyteTracedException,
278
+ "Please check the logged errors for more information.",
279
+ )
275
280
  ).build()
276
281
 
277
282
 
@@ -416,6 +421,10 @@ skip_record_scenario_multi_stream = (
416
421
  ]
417
422
  }
418
423
  )
424
+ .set_expected_read_error(
425
+ AirbyteTracedException,
426
+ "Please check the logged errors for more information.",
427
+ )
419
428
  ).build()
420
429
 
421
430
 
@@ -492,19 +501,9 @@ emit_record_scenario_single_stream = (
492
501
  },
493
502
  ]
494
503
  )
495
- .set_expected_logs(
496
- {
497
- "read": [
498
- {
499
- "level": "ERROR",
500
- "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0",
501
- },
502
- {
503
- "level": "WARN",
504
- "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
505
- },
506
- ]
507
- }
504
+ .set_expected_read_error(
505
+ AirbyteTracedException,
506
+ "Please check the logged errors for more information.",
508
507
  )
509
508
  ).build()
510
509
 
@@ -640,23 +639,9 @@ emit_record_scenario_multi_stream = (
640
639
  },
641
640
  ]
642
641
  )
643
- .set_expected_logs(
644
- {
645
- "read": [
646
- {
647
- "level": "ERROR",
648
- "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0",
649
- },
650
- {
651
- "level": "WARN",
652
- "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
653
- },
654
- {
655
- "level": "WARN",
656
- "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
657
- },
658
- ]
659
- }
642
+ .set_expected_read_error(
643
+ AirbyteTracedException,
644
+ "Please check the logged errors for more information.",
660
645
  )
661
646
  ).build()
662
647
 
@@ -2,15 +2,18 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ import traceback
5
6
  import unittest
6
7
  from datetime import datetime, timezone
7
8
  from typing import Any, Iterable, Iterator, Mapping
8
9
  from unittest.mock import Mock
9
10
 
10
11
  import pytest
11
- from airbyte_cdk.models import Level
12
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
13
+ from airbyte_cdk.models import Type as MessageType
12
14
  from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
13
15
  from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
16
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError
14
17
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
15
18
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
16
19
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -55,12 +58,17 @@ class MockFormat:
55
58
  ),
56
59
  pytest.param(
57
60
  {"type": "object", "properties": {"prop": {"type": "string"}}},
58
- {"type": ["null", "object"], "properties": {"prop": {"type": ["null", "string"]}}},
61
+ {
62
+ "type": ["null", "object"],
63
+ "properties": {"prop": {"type": ["null", "string"]}},
64
+ },
59
65
  id="deeply-nested-schema",
60
66
  ),
61
67
  ],
62
68
  )
63
- def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]) -> None:
69
+ def test_fill_nulls(
70
+ input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]
71
+ ) -> None:
64
72
  assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
65
73
 
66
74
 
@@ -90,21 +98,33 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
90
98
  parsers={MockFormat: self._parser},
91
99
  validation_policy=self._validation_policy,
92
100
  cursor=self._cursor,
101
+ errors_collector=FileBasedErrorsCollector(),
93
102
  )
94
103
 
95
104
  def test_when_read_records_from_slice_then_return_records(self) -> None:
96
105
  self._parser.parse_records.return_value = [self._A_RECORD]
97
- messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}))
98
- assert list(map(lambda message: message.record.data["data"], messages)) == [self._A_RECORD]
106
+ messages = list(
107
+ self._stream.read_records_from_slice(
108
+ {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}
109
+ )
110
+ )
111
+ assert list(map(lambda message: message.record.data["data"], messages)) == [
112
+ self._A_RECORD
113
+ ]
99
114
 
100
- def test_given_exception_when_read_records_from_slice_then_do_process_other_files(self) -> None:
115
+ def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
116
+ self,
117
+ ) -> None:
101
118
  """
102
119
  The current behavior for source-s3 v3 does not fail the sync on some errors, so we will keep this behavior for now. One example
103
120
  that is easy to reproduce is a file with a gzip extension that is not actually a gzip file. The reader will fail to open
104
121
  the file but the sync won't fail.
105
122
  Ticket: https://github.com/airbytehq/airbyte/issues/29680
106
123
  """
107
- self._parser.parse_records.side_effect = [ValueError("An error"), [self._A_RECORD]]
124
+ self._parser.parse_records.side_effect = [
125
+ ValueError("An error"),
126
+ [self._A_RECORD],
127
+ ]
108
128
 
109
129
  messages = list(
110
130
  self._stream.read_records_from_slice(
@@ -120,7 +140,9 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
120
140
  assert messages[0].log.level == Level.ERROR
121
141
  assert messages[1].record.data["data"] == self._A_RECORD
122
142
 
123
- def test_given_traced_exception_when_read_records_from_slice_then_fail(self) -> None:
143
+ def test_given_traced_exception_when_read_records_from_slice_then_fail(
144
+ self,
145
+ ) -> None:
124
146
  """
125
147
  When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller.
126
148
  """
@@ -138,10 +160,14 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
138
160
  )
139
161
  )
140
162
 
141
- def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(self) -> None:
163
+ def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
164
+ self,
165
+ ) -> None:
142
166
  self._stream_config.schemaless = False
143
167
  self._validation_policy.record_passes_validation_policy.return_value = False
144
- self._parser.parse_records.side_effect = [self._iter([self._A_RECORD, ValueError("An error")])]
168
+ self._parser.parse_records.side_effect = [
169
+ self._iter([self._A_RECORD, ValueError("An error")])
170
+ ]
145
171
 
146
172
  messages = list(
147
173
  self._stream.read_records_from_slice(
@@ -183,3 +209,54 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
183
209
  if isinstance(item, Exception):
184
210
  raise item
185
211
  yield item
212
+
213
+
214
+ class TestFileBasedErrorCollector:
215
+ test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
216
+
217
+ @pytest.mark.parametrize(
218
+ "stream, file, line_no, n_skipped, collector_expected_len",
219
+ (
220
+ ("stream_1", "test.csv", 1, 1, 1),
221
+ ("stream_2", "test2.csv", 2, 2, 2),
222
+ ),
223
+ ids=[
224
+ "Single error",
225
+ "Multiple errors",
226
+ ],
227
+ )
228
+ def test_collect_parsing_error(
229
+ self, stream, file, line_no, n_skipped, collector_expected_len
230
+ ) -> None:
231
+ test_error_pattern = "Error parsing record."
232
+ # format the error body
233
+ test_error = (
234
+ AirbyteMessage(
235
+ type=MessageType.LOG,
236
+ log=AirbyteLogMessage(
237
+ level=Level.ERROR,
238
+ message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}",
239
+ stack_trace=traceback.format_exc(),
240
+ ),
241
+ ),
242
+ )
243
+ # collecting the error
244
+ self.test_error_collector.collect(test_error)
245
+ # check the error has been collected
246
+ assert len(self.test_error_collector.errors) == collector_expected_len
247
+ # check that the pattern is present in each collected error
248
+ for error in self.test_error_collector.errors:
249
+ assert test_error_pattern in error[0].log.message
250
+
251
+ def test_yield_and_raise_collected(self) -> None:
252
+ # we expect the following method will raise the AirbyteTracedException
253
+ with pytest.raises(AirbyteTracedException) as parse_error:
254
+ list(self.test_error_collector.yield_and_raise_collected())
255
+ assert (
256
+ parse_error.value.message
257
+ == "Some errors occured while reading from the source."
258
+ )
259
+ assert (
260
+ parse_error.value.internal_message
261
+ == "Please check the logged errors for more information."
262
+ )
@@ -48,6 +48,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
48
48
  csv_strings_can_be_null_not_quoted_scenario,
49
49
  earlier_csv_scenario,
50
50
  empty_schema_inference_scenario,
51
+ invalid_csv_multi_scenario,
51
52
  invalid_csv_scenario,
52
53
  multi_csv_scenario,
53
54
  multi_csv_stream_n_file_exceeds_limit_for_inference,
@@ -132,6 +133,7 @@ discover_scenarios = [
132
133
  csv_multi_stream_scenario,
133
134
  csv_single_stream_scenario,
134
135
  invalid_csv_scenario,
136
+ invalid_csv_multi_scenario,
135
137
  single_csv_scenario,
136
138
  multi_csv_scenario,
137
139
  multi_csv_stream_n_file_exceeds_limit_for_inference,
@@ -9,6 +9,7 @@ from unittest.mock import patch
9
9
  import pytest
10
10
  import requests
11
11
  from airbyte_cdk.entrypoint import launch
12
+ from airbyte_cdk.utils import AirbyteTracedException
12
13
  from unit_tests.sources.fixtures.source_test_fixture import (
13
14
  HttpTestStream,
14
15
  SourceFixtureOauthAuthenticator,
@@ -22,8 +23,8 @@ from unit_tests.sources.fixtures.source_test_fixture import (
22
23
  [
23
24
  pytest.param("CLOUD", "https://airbyte.com/api/v1/", [], None, id="test_cloud_read_with_public_endpoint"),
24
25
  pytest.param("CLOUD", "http://unsecured.com/api/v1/", [], ValueError, id="test_cloud_read_with_unsecured_url"),
25
- pytest.param("CLOUD", "https://172.20.105.99/api/v1/", [], ValueError, id="test_cloud_read_with_private_endpoint"),
26
- pytest.param("CLOUD", "https://localhost:80/api/v1/", [], ValueError, id="test_cloud_read_with_localhost"),
26
+ pytest.param("CLOUD", "https://172.20.105.99/api/v1/", [], AirbyteTracedException, id="test_cloud_read_with_private_endpoint"),
27
+ pytest.param("CLOUD", "https://localhost:80/api/v1/", [], AirbyteTracedException, id="test_cloud_read_with_localhost"),
27
28
  pytest.param("OSS", "https://airbyte.com/api/v1/", [], None, id="test_oss_read_with_public_endpoint"),
28
29
  pytest.param("OSS", "https://172.20.105.99/api/v1/", [], None, id="test_oss_read_with_private_endpoint"),
29
30
  ],
@@ -47,7 +48,7 @@ def test_external_request_source(capsys, deployment_mode, url_base, expected_rec
47
48
  [
48
49
  pytest.param("CLOUD", "https://airbyte.com/api/v1/", [], None, id="test_cloud_read_with_public_endpoint"),
49
50
  pytest.param("CLOUD", "http://unsecured.com/api/v1/", [], ValueError, id="test_cloud_read_with_unsecured_url"),
50
- pytest.param("CLOUD", "https://172.20.105.99/api/v1/", [], ValueError, id="test_cloud_read_with_private_endpoint"),
51
+ pytest.param("CLOUD", "https://172.20.105.99/api/v1/", [], AirbyteTracedException, id="test_cloud_read_with_private_endpoint"),
51
52
  pytest.param("OSS", "https://airbyte.com/api/v1/", [], None, id="test_oss_read_with_public_endpoint"),
52
53
  pytest.param("OSS", "https://172.20.105.99/api/v1/", [], None, id="test_oss_read_with_private_endpoint"),
53
54
  ],