airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. airbyte_cdk/entrypoint.py +7 -0
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
  4. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
  5. airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
  6. airbyte_cdk/sources/file_based/file_based_source.py +4 -5
  7. airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
  8. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
  9. airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
  10. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
  11. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
  12. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
  13. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
  14. unit_tests/sources/file_based/config/test_csv_format.py +23 -0
  15. unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/in_memory_files_source.py +11 -3
  18. unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
  19. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
  20. unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
  21. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
  22. unit_tests/sources/file_based/test_scenarios.py +30 -0
  23. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
  24. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
  25. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
airbyte_cdk/entrypoint.py CHANGED
@@ -181,6 +181,13 @@ class AirbyteEntrypoint(object):
             return parsed_args.catalog
         return None
 
+    @classmethod
+    def extract_config(cls, args: List[str]) -> Optional[Any]:
+        parsed_args = cls.parse_args(args)
+        if hasattr(parsed_args, "config"):
+            return parsed_args.config
+        return None
+
     def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
         if hasattr(source, "message_repository") and source.message_repository:
            yield from source.message_repository.consume_queue()
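For orientation, the new classmethod mirrors the catalog counterpart whose tail appears in the context lines above: it parses the launch arguments and returns the config path if one was passed. A minimal usage sketch (the argument values are illustrative, not taken from this diff):

from airbyte_cdk.entrypoint import AirbyteEntrypoint

# Hypothetical launch arguments; any "--config <path>" pair will do.
args = ["read", "--config", "secrets/config.json", "--catalog", "configured_catalog.json"]
config_path = AirbyteEntrypoint.extract_config(args)    # -> "secrets/config.json"
catalog_path = AirbyteEntrypoint.extract_catalog(args)   # -> "configured_catalog.json"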

airbyte_cdk/sources/declarative/declarative_component_schema.yaml CHANGED
@@ -578,9 +578,9 @@ definitions:
           - "created_at"
           - "{{ config['record_cursor'] }}"
       datetime_format:
-        title: Cursor Field Datetime Format
+        title: Outgoing Datetime Format
         description: |
-          The datetime format of the Cursor Field. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:
+          The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:
           * **%s**: Epoch unix timestamp - `1686218963`
           * **%a**: Weekday (abbreviated) - `Sun`
           * **%A**: Weekday (full) - `Sunday`
@@ -626,7 +626,7 @@ definitions:
           - "{{ config['start_time'] }}"
       cursor_datetime_formats:
         title: Cursor Datetime Formats
-        description: The possible formats for the cursor field
+        description: The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.
         type: array
         items:
           type: string

airbyte_cdk/sources/declarative/models/declarative_component_schema.py CHANGED
@@ -810,9 +810,9 @@ class DatetimeBasedCursor(BaseModel):
     )
     datetime_format: str = Field(
         ...,
-        description="The datetime format of the Cursor Field. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
+        description="The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
         examples=["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d", "%s"],
-        title="Cursor Field Datetime Format",
+        title="Outgoing Datetime Format",
     )
     start_datetime: Union[str, MinMaxDatetime] = Field(
         ...,
@@ -822,7 +822,7 @@ class DatetimeBasedCursor(BaseModel):
     )
     cursor_datetime_formats: Optional[List[str]] = Field(
         None,
-        description="The possible formats for the cursor field",
+        description="The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.",
         title="Cursor Datetime Formats",
     )
     cursor_granularity: Optional[str] = Field(

airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py CHANGED
@@ -4,7 +4,7 @@
 
 import logging
 import traceback
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 from airbyte_cdk.sources import Source
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
@@ -12,14 +12,16 @@ from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, Fi
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
-from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
+
+if TYPE_CHECKING:
+    from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
 
 
 class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy):
     def __init__(self, stream_reader: AbstractFileBasedStreamReader):
         self.stream_reader = stream_reader
 
-    def check_availability(self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
+    def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
         """
         Perform a connection check for the stream (verify that we can list files from the stream).
 
@@ -33,7 +35,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         return True, None
 
     def check_availability_and_parsability(
-        self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]
+        self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]
     ) -> Tuple[bool, Optional[str]]:
         """
         Perform a connection check for the stream.
@@ -51,8 +53,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         - If the user provided a schema in the config, check that a subset of records in
           one file conform to the schema via a call to stream.conforms_to_schema(schema).
         """
-        if not isinstance(stream, AbstractFileBasedStream):
-            raise ValueError(f"Stream {stream.name} is not a file-based stream.")
         try:
             files = self._check_list_files(stream)
             self._check_extensions(stream, files)
@@ -62,7 +62,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return True, None
 
-    def _check_list_files(self, stream: AbstractFileBasedStream) -> List[RemoteFile]:
+    def _check_list_files(self, stream: "AbstractFileBasedStream") -> List[RemoteFile]:
         try:
             files = stream.list_files()
         except Exception as exc:
@@ -73,12 +73,12 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return files
 
-    def _check_extensions(self, stream: AbstractFileBasedStream, files: List[RemoteFile]) -> None:
+    def _check_extensions(self, stream: "AbstractFileBasedStream", files: List[RemoteFile]) -> None:
         if not all(f.extension_agrees_with_file_type(stream.config.file_type) for f in files):
             raise CheckAvailabilityError(FileBasedSourceError.EXTENSION_MISMATCH, stream=stream.name)
         return None
 
-    def _check_parse_record(self, stream: AbstractFileBasedStream, file: RemoteFile, logger: logging.Logger) -> None:
+    def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
         parser = stream.get_parser(stream.config.file_type)
 
         try:
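The change above swaps a runtime import for a TYPE_CHECKING-only import and quotes the annotations, which breaks the circular import between the availability strategy and the stream module. A generic sketch of the pattern (the helper name here is hypothetical, not from this diff):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers, so nothing is imported at runtime.
    from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream

def describe_stream(stream: "AbstractFileBasedStream") -> str:
    # The quoted annotation is resolved lazily; the object is used normally at runtime.
    return stream.name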

airbyte_cdk/sources/file_based/config/csv_format.py CHANGED
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional
+from typing import Any, Mapping, Optional, Set
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -17,6 +17,10 @@ class QuotingBehavior(Enum):
     QUOTE_NONE = "Quote None"
 
 
+DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
+DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
+
+
 class CsvFormat(BaseModel):
     filetype: Literal["csv"] = "csv"
     delimiter: str = Field(
@@ -46,10 +50,34 @@
         default=QuotingBehavior.QUOTE_SPECIAL_CHARACTERS,
         description="The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.",
     )
-
-    # Noting that the existing S3 connector had a config option newlines_in_values. This was only supported by pyarrow and not
-    # the Python csv package. It has a little adoption, but long term we should ideally phase this out because of the drawbacks
-    # of using pyarrow
+    null_values: Set[str] = Field(
+        title="Null Values",
+        default=[],
+        description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
+    )
+    skip_rows_before_header: int = Field(
+        title="Skip Rows Before Header",
+        default=0,
+        description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
+    )
+    skip_rows_after_header: int = Field(
+        title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
+    )
+    autogenerate_column_names: bool = Field(
+        title="Autogenerate Column Names",
+        default=False,
+        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    )
+    true_values: Set[str] = Field(
+        title="True Values",
+        default=DEFAULT_TRUE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as true values.",
+    )
+    false_values: Set[str] = Field(
+        title="False Values",
+        default=DEFAULT_FALSE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as false values.",
+    )
 
     @validator("delimiter")
     def validate_delimiter(cls, v: str) -> str:
@@ -78,3 +106,11 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
+        skip_rows_before_header = values.get("skip_rows_before_header", 0)
+        auto_generate_column_names = values.get("autogenerate_column_names", False)
+        if skip_rows_before_header > 0 and auto_generate_column_names:
+            raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.")
+        return values
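Taken together, the new CsvFormat fields can be exercised roughly as follows (a sketch with made-up values; the new root validator rejects combining skip_rows_before_header > 0 with autogenerate_column_names=True):

from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat

fmt = CsvFormat(
    delimiter=",",
    null_values={"NA", "N/A", ""},   # these strings become None in emitted records
    skip_rows_before_header=2,       # header sits on the third physical row
    skip_rows_after_header=1,        # e.g. a units row directly under the header
    true_values={"yes", "1"},
    false_values={"no", "0"},
)
# CsvFormat(skip_rows_before_header=2, autogenerate_column_names=True) would raise a ValueError.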

airbyte_cdk/sources/file_based/file_based_source.py CHANGED
@@ -19,12 +19,11 @@ from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
 from airbyte_cdk.sources.streams import Stream
 from pydantic.error_wrappers import ValidationError
 
-DEFAULT_MAX_HISTORY_SIZE = 10_000
-
 
 class FileBasedSource(AbstractSource, ABC):
     def __init__(
@@ -36,7 +35,7 @@ class FileBasedSource(AbstractSource, ABC):
         discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
         parsers: Mapping[str, FileTypeParser] = default_parsers,
         validation_policies: Mapping[str, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
-        max_history_size: int = DEFAULT_MAX_HISTORY_SIZE,
+        cursor_cls: Type[AbstractFileBasedCursor] = DefaultFileBasedCursor,
     ):
         self.stream_reader = stream_reader
         self.spec_class = spec_class
@@ -46,7 +45,7 @@ class FileBasedSource(AbstractSource, ABC):
         self.validation_policies = validation_policies
         catalog = self.read_catalog(catalog_path) if catalog_path else None
         self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
-        self.max_history_size = max_history_size
+        self.cursor_cls = cursor_cls
         self.logger = logging.getLogger(f"airbyte.{self.name}")
 
     def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
@@ -104,7 +103,7 @@ class FileBasedSource(AbstractSource, ABC):
                     discovery_policy=self.discovery_policy,
                     parsers=self.parsers,
                     validation_policy=self._validate_and_get_validation_policy(stream_config),
-                    cursor=DefaultFileBasedCursor(self.max_history_size, stream_config.days_to_sync_if_history_is_full),
+                    cursor=self.cursor_cls(stream_config),
                 )
             )
         return streams
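The constructor change turns the cursor into an injection point: instead of tuning max_history_size, a source now passes a cursor class, and each stream receives cursor=self.cursor_cls(stream_config) as shown in the hunk above. A rough sketch of how a connector might use it (MyCustomCursor is hypothetical, and the elided constructor arguments are not spelled out in this diff):

from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor

class MyCustomCursor(DefaultFileBasedCursor):
    """Hypothetical cursor overriding pieces of the default incremental behaviour."""

# Inside a concrete FileBasedSource subclass, the cursor class is forwarded unchanged:
# super().__init__(..., cursor_cls=MyCustomCursor)   # defaults to DefaultFileBasedCursor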

airbyte_cdk/sources/file_based/file_types/csv_parser.py CHANGED
@@ -5,12 +5,13 @@
 import csv
 import json
 import logging
-from distutils.util import strtobool
-from typing import Any, Dict, Iterable, Mapping, Optional
+from functools import partial
+from io import IOBase
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Set
 
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -34,30 +35,25 @@ class CsvParser(FileTypeParser):
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> Dict[str, Any]:
-        config_format = config.format.get(config.file_type) if config.format else None
-        if config_format:
-            if not isinstance(config_format, CsvFormat):
-                raise ValueError(f"Invalid format config: {config_format}")
-            dialect_name = config.name + DIALECT_NAME
-            csv.register_dialect(
-                dialect_name,
-                delimiter=config_format.delimiter,
-                quotechar=config_format.quote_char,
-                escapechar=config_format.escape_char,
-                doublequote=config_format.double_quote,
-                quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
-            )
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
-                # sources will likely require one. Rather than modify the interface now we can wait until the real use case
-                reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
-                schema = {field.strip(): {"type": "string"} for field in next(reader)}
-                csv.unregister_dialect(dialect_name)
-                return schema
-        else:
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                reader = csv.DictReader(fp)  # type: ignore
-                return {field.strip(): {"type": "string"} for field in next(reader)}
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            # sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            headers = self._get_headers(fp, config_format, dialect_name)
+            schema = {field.strip(): {"type": "string"} for field in headers}
+            csv.unregister_dialect(dialect_name)
+            return schema
 
     def parse_records(
         self,
@@ -67,30 +63,28 @@
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
         schema: Mapping[str, Any] = config.input_schema  # type: ignore
-        config_format = config.format.get(config.file_type) if config.format else None
-        if config_format:
-            if not isinstance(config_format, CsvFormat):
-                raise ValueError(f"Invalid format config: {config_format}")
-            # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
-            # Wwe don't unregister the dialect because we are lazily parsing each csv file to generate records
-            dialect_name = config.name + DIALECT_NAME
-            csv.register_dialect(
-                dialect_name,
-                delimiter=config_format.delimiter,
-                quotechar=config_format.quote_char,
-                escapechar=config_format.escape_char,
-                doublequote=config_format.double_quote,
-                quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
-            )
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
-                # sources will likely require one. Rather than modify the interface now we can wait until the real use case
-                reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
-                yield from self._read_and_cast_types(reader, schema, logger)
-        else:
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                reader = csv.DictReader(fp)  # type: ignore
-                yield from self._read_and_cast_types(reader, schema, logger)
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
+        # We don't unregister the dialect because we are lazily parsing each csv file to generate records
+        # This will potentially be a problem if we ever process multiple streams concurrently
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            # sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            field_names = self._auto_generate_headers(fp, config_format) if config_format.autogenerate_column_names else None
+            reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=field_names)  # type: ignore
+            yield from self._read_and_cast_types(reader, schema, config_format, logger)
 
     @property
     def file_read_mode(self) -> FileReadMode:
@@ -98,7 +92,7 @@ class CsvParser(FileTypeParser):
 
     @staticmethod
     def _read_and_cast_types(
-        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], logger: logging.Logger  # type: ignore
+        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger  # type: ignore
     ) -> Iterable[Dict[str, Any]]:
         """
         If the user provided a schema, attempt to cast the record values to the associated type.
@@ -107,16 +101,65 @@
         cast it to a string. Downstream, the user's validation policy will determine whether the
         record should be emitted.
         """
-        if not schema:
-            yield from reader
+        cast_fn = CsvParser._get_cast_function(schema, config_format, logger)
+        for i, row in enumerate(reader):
+            if i < config_format.skip_rows_after_header:
+                continue
+            # The row was not properly parsed if any of the values are None
+            if any(val is None for val in row.values()):
+                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
+            else:
+                yield CsvParser._to_nullable(cast_fn(row), config_format.null_values)
 
-        else:
+    @staticmethod
+    def _get_cast_function(
+        schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger
+    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
+        # Only cast values if the schema is provided
+        if schema:
             property_types = {col: prop["type"] for col, prop in schema["properties"].items()}
-            for row in reader:
-                yield cast_types(row, property_types, logger)
+            return partial(_cast_types, property_types=property_types, config_format=config_format, logger=logger)
+        else:
+            # If no schema is provided, yield the rows as they are
+            return _no_cast
+
+    @staticmethod
+    def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Optional[str]]:
+        nullable = row | {k: None if v in null_values else v for k, v in row.items()}
+        return nullable
+
+    @staticmethod
+    def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None:
+        """
+        Skip rows before the header. This has to be done on the file object itself, not the reader
+        """
+        for _ in range(rows_to_skip):
+            fp.readline()
+
+    def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]:
+        # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if config_format.autogenerate_column_names:
+            return self._auto_generate_headers(fp, config_format)
+        else:
+            # If we're not autogenerating column names, we need to skip the rows before the header
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            # Then read the header
+            reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
+            return next(reader)  # type: ignore
 
+    def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat) -> List[str]:
+        """
+        Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
+        See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+        """
+        next_line = next(fp).strip()
+        number_of_columns = len(next_line.split(config_format.delimiter))  # type: ignore
+        # Reset the file pointer to the beginning of the file so that the first row is not skipped
+        fp.seek(0)
+        return [f"f{i}" for i in range(number_of_columns)]
 
-def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logging.Logger) -> Dict[str, Any]:
+
+def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]:
     """
     Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.
 
@@ -142,7 +185,7 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
 
         elif python_type == bool:
             try:
-                cast_value = strtobool(value)
+                cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values)
             except ValueError:
                 warnings.append(_format_warning(key, value, prop_type))
 
@@ -178,5 +221,17 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
     return result
 
 
+def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool:
+    if value in true_values:
+        return True
+    if value in false_values:
+        return False
+    raise ValueError(f"Value {value} is not a valid boolean value")
+
+
 def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str:
     return f"{key}: value={value},expected_type={expected_type}"
+
+
+def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]:
+    return row
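To make the behaviour change concrete: strtobool accepted a fixed set of strings, whereas the new helper only honours the strings configured on the CsvFormat, and configured null markers are mapped to None after casting. An illustrative snippet against the private helpers above (internal API, shown only to document the semantics; the values are examples):

from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _value_to_bool

_value_to_bool("yes", true_values={"yes", "1"}, false_values={"no", "0"})   # True
_value_to_bool("0", true_values={"yes", "1"}, false_values={"no", "0"})     # False
# Anything outside both sets raises ValueError, which surfaces as a casting warning upstream.

CsvParser._to_nullable({"id": "1", "comment": "NA"}, null_values={"NA"})
# -> {"id": "1", "comment": None}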

airbyte_cdk/sources/file_based/stream/cursor/__init__.py CHANGED
@@ -1,4 +1,4 @@
+from .abstract_file_based_cursor import AbstractFileBasedCursor
 from .default_file_based_cursor import DefaultFileBasedCursor
-from .file_based_cursor import FileBasedCursor
 
-__all__ = ["FileBasedCursor", "DefaultFileBasedCursor"]
+__all__ = ["AbstractFileBasedCursor", "DefaultFileBasedCursor"]

airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} RENAMED
@@ -7,15 +7,23 @@ from abc import ABC, abstractmethod
 from datetime import datetime
 from typing import Any, Iterable, MutableMapping
 
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
-class FileBasedCursor(ABC):
+class AbstractFileBasedCursor(ABC):
     """
     Abstract base class for cursors used by file-based streams.
     """
 
+    @abstractmethod
+    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
+        """
+        Common interface for all cursors.
+        """
+        ...
+
     @abstractmethod
     def add_file(self, file: RemoteFile) -> None:
         """

airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py CHANGED
@@ -4,26 +4,26 @@
 
 import logging
 from datetime import datetime, timedelta
-from typing import Iterable, MutableMapping, Optional
+from typing import Any, Iterable, MutableMapping, Optional
 
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.cursor.file_based_cursor import FileBasedCursor
+from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
-class DefaultFileBasedCursor(FileBasedCursor):
+class DefaultFileBasedCursor(AbstractFileBasedCursor):
     DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+    DEFAULT_MAX_HISTORY_SIZE = 10_000
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
 
-    def __init__(self, max_history_size: int, days_to_sync_if_history_is_full: Optional[int]):
+    def __init__(self, stream_config: FileBasedStreamConfig, **_: Any):
+        super().__init__(stream_config)
         self._file_to_datetime_history: MutableMapping[str, str] = {}
-        self._max_history_size = max_history_size
         self._time_window_if_history_is_full = timedelta(
-            days=days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+            days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
         )
 
-        if self._max_history_size <= 0:
-            raise ValueError(f"max_history_size must be a positive integer, got {self._max_history_size}")
         if self._time_window_if_history_is_full <= timedelta():
             raise ValueError(f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}")
 
@@ -37,7 +37,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
 
     def add_file(self, file: RemoteFile) -> None:
         self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
-        if len(self._file_to_datetime_history) > self._max_history_size:
+        if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
             # Get the earliest file based on its last modified date and its uri
             oldest_file = self._compute_earliest_file_in_history()
             if oldest_file:
@@ -67,7 +67,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
         """
         Returns true if the state's history is full, meaning new entries will start to replace old entries.
        """
-        return len(self._file_to_datetime_history) >= self._max_history_size
+        return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE
 
     def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
         if file.uri in self._file_to_datetime_history:
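The net effect on construction: the cursor is now built from the stream config alone, and the history cap is a class constant rather than a constructor argument. A before/after sketch (stream_config stands for a FileBasedStreamConfig instance and is not defined here):

# 0.50.0
# cursor = DefaultFileBasedCursor(max_history_size=10_000, days_to_sync_if_history_is_full=3)

# 0.50.2
cursor = DefaultFileBasedCursor(stream_config)
assert DefaultFileBasedCursor.DEFAULT_MAX_HISTORY_SIZE == 10_000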

airbyte_cdk/sources/file_based/stream/default_file_based_stream.py CHANGED
@@ -15,13 +15,14 @@ from airbyte_cdk.sources.file_based.exceptions import (
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
+    RecordParseError,
     SchemaInferenceError,
     StopSyncPerValidationPolicy,
 )
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
-from airbyte_cdk.sources.file_based.stream.cursor import FileBasedCursor
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamSlice
 from airbyte_cdk.sources.streams import IncrementalMixin
 from airbyte_cdk.sources.streams.core import JsonSchema
@@ -39,7 +40,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     ab_file_name_col = "_ab_source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
 
-    def __init__(self, cursor: FileBasedCursor, **kwargs: Any):
+    def __init__(self, cursor: AbstractFileBasedCursor, **kwargs: Any):
         super().__init__(**kwargs)
         self._cursor = cursor
 
@@ -105,6 +106,18 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
                 )
                 break
 
+            except RecordParseError:
+                # Increment line_no because the exception was raised before we could increment it
+                line_no += 1
+                yield AirbyteMessage(
+                    type=MessageType.LOG,
+                    log=AirbyteLogMessage(
+                        level=Level.ERROR,
+                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
+                        stack_trace=traceback.format_exc(),
+                    ),
+                )
+
             except Exception:
                 yield AirbyteMessage(
                     type=MessageType.LOG,

{airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.50.0
+Version: 0.50.2
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte