airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl

Files changed (25)
  1. airbyte_cdk/entrypoint.py +7 -0
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
  4. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
  5. airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
  6. airbyte_cdk/sources/file_based/file_based_source.py +4 -5
  7. airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
  8. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
  9. airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
  10. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
  11. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
  12. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
  13. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
  14. unit_tests/sources/file_based/config/test_csv_format.py +23 -0
  15. unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/in_memory_files_source.py +11 -3
  18. unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
  19. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
  20. unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
  21. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
  22. unit_tests/sources/file_based/test_scenarios.py +30 -0
  23. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
  24. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
  25. {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
airbyte_cdk/entrypoint.py CHANGED
@@ -181,6 +181,13 @@ class AirbyteEntrypoint(object):
             return parsed_args.catalog
         return None
 
+    @classmethod
+    def extract_config(cls, args: List[str]) -> Optional[Any]:
+        parsed_args = cls.parse_args(args)
+        if hasattr(parsed_args, "config"):
+            return parsed_args.config
+        return None
+
     def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
         if hasattr(source, "message_repository") and source.message_repository:
            yield from source.message_repository.consume_queue()
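
For orientation, a minimal sketch of how the new classmethod could be exercised; the CLI arguments shown are hypothetical and not taken from this diff:

from airbyte_cdk.entrypoint import AirbyteEntrypoint

# Hypothetical CLI arguments; extract_config() parses them and returns the
# value of the --config argument, or None if it is absent.
args = ["check", "--config", "secrets/config.json"]
config_path = AirbyteEntrypoint.extract_config(args)
print(config_path)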

airbyte_cdk/sources/declarative/declarative_component_schema.yaml CHANGED
@@ -578,9 +578,9 @@ definitions:
           - "created_at"
           - "{{ config['record_cursor'] }}"
       datetime_format:
-        title: Cursor Field Datetime Format
+        title: Outgoing Datetime Format
         description: |
-          The datetime format of the Cursor Field. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:
+          The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:
           * **%s**: Epoch unix timestamp - `1686218963`
           * **%a**: Weekday (abbreviated) - `Sun`
           * **%A**: Weekday (full) - `Sunday`
@@ -626,7 +626,7 @@ definitions:
           - "{{ config['start_time'] }}"
       cursor_datetime_formats:
         title: Cursor Datetime Formats
-        description: The possible formats for the cursor field
+        description: The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.
         type: array
         items:
           type: string

airbyte_cdk/sources/declarative/models/declarative_component_schema.py CHANGED
@@ -810,9 +810,9 @@ class DatetimeBasedCursor(BaseModel):
     )
     datetime_format: str = Field(
         ...,
-        description="The datetime format of the Cursor Field. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
+        description="The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
         examples=["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d", "%s"],
-        title="Cursor Field Datetime Format",
+        title="Outgoing Datetime Format",
     )
     start_datetime: Union[str, MinMaxDatetime] = Field(
         ...,
@@ -822,7 +822,7 @@ class DatetimeBasedCursor(BaseModel):
     )
     cursor_datetime_formats: Optional[List[str]] = Field(
         None,
-        description="The possible formats for the cursor field",
+        description="The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.",
         title="Cursor Datetime Formats",
     )
     cursor_granularity: Optional[str] = Field(
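
To illustrate the behaviour described above ("in order of preference, the first matching format wins"), here is a small standalone sketch using only the standard library; it is not the CDK's implementation:

from datetime import datetime

cursor_datetime_formats = ["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d"]

def parse_cursor_value(value: str) -> datetime:
    # Try each configured format in order and use the first one that matches.
    for fmt in cursor_datetime_formats:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError(f"No cursor_datetime_format matches {value!r}")

print(parse_cursor_value("2023-07-15"))  # parsed by the second format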

airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py CHANGED
@@ -4,7 +4,7 @@
 
 import logging
 import traceback
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 from airbyte_cdk.sources import Source
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
@@ -12,14 +12,16 @@ from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, Fi
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
-from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
+
+if TYPE_CHECKING:
+    from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
 
 
 class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy):
     def __init__(self, stream_reader: AbstractFileBasedStreamReader):
         self.stream_reader = stream_reader
 
-    def check_availability(self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
+    def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
         """
         Perform a connection check for the stream (verify that we can list files from the stream).
 
@@ -33,7 +35,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         return True, None
 
     def check_availability_and_parsability(
-        self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]
+        self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]
     ) -> Tuple[bool, Optional[str]]:
         """
         Perform a connection check for the stream.
@@ -51,8 +53,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         - If the user provided a schema in the config, check that a subset of records in
           one file conform to the schema via a call to stream.conforms_to_schema(schema).
         """
-        if not isinstance(stream, AbstractFileBasedStream):
-            raise ValueError(f"Stream {stream.name} is not a file-based stream.")
         try:
             files = self._check_list_files(stream)
             self._check_extensions(stream, files)
@@ -62,7 +62,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return True, None
 
-    def _check_list_files(self, stream: AbstractFileBasedStream) -> List[RemoteFile]:
+    def _check_list_files(self, stream: "AbstractFileBasedStream") -> List[RemoteFile]:
         try:
             files = stream.list_files()
         except Exception as exc:
@@ -73,12 +73,12 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return files
 
-    def _check_extensions(self, stream: AbstractFileBasedStream, files: List[RemoteFile]) -> None:
+    def _check_extensions(self, stream: "AbstractFileBasedStream", files: List[RemoteFile]) -> None:
         if not all(f.extension_agrees_with_file_type(stream.config.file_type) for f in files):
             raise CheckAvailabilityError(FileBasedSourceError.EXTENSION_MISMATCH, stream=stream.name)
         return None
 
-    def _check_parse_record(self, stream: AbstractFileBasedStream, file: RemoteFile, logger: logging.Logger) -> None:
+    def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
         parser = stream.get_parser(stream.config.file_type)
 
         try:
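
The change above is an instance of the standard typing.TYPE_CHECKING pattern: the import only exists for type checkers, and the annotation is quoted so it is never evaluated at runtime, which avoids a circular import. A generic sketch (module names are hypothetical):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only while type checking, so no circular import at runtime.
    from mypackage.stream import MyStream  # hypothetical module

def describe(stream: "MyStream") -> str:
    # The quoted annotation is resolved lazily by the type checker.
    return f"stream: {stream!r}"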

airbyte_cdk/sources/file_based/config/csv_format.py CHANGED
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional
+from typing import Any, Mapping, Optional, Set
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -17,6 +17,10 @@ class QuotingBehavior(Enum):
     QUOTE_NONE = "Quote None"
 
 
+DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
+DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
+
+
 class CsvFormat(BaseModel):
     filetype: Literal["csv"] = "csv"
     delimiter: str = Field(
@@ -46,10 +50,34 @@
         default=QuotingBehavior.QUOTE_SPECIAL_CHARACTERS,
         description="The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.",
     )
-
-    # Noting that the existing S3 connector had a config option newlines_in_values. This was only supported by pyarrow and not
-    # the Python csv package. It has a little adoption, but long term we should ideally phase this out because of the drawbacks
-    # of using pyarrow
+    null_values: Set[str] = Field(
+        title="Null Values",
+        default=[],
+        description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
+    )
+    skip_rows_before_header: int = Field(
+        title="Skip Rows Before Header",
+        default=0,
+        description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
+    )
+    skip_rows_after_header: int = Field(
+        title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
+    )
+    autogenerate_column_names: bool = Field(
+        title="Autogenerate Column Names",
+        default=False,
+        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    )
+    true_values: Set[str] = Field(
+        title="True Values",
+        default=DEFAULT_TRUE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as true values.",
+    )
+    false_values: Set[str] = Field(
+        title="False Values",
+        default=DEFAULT_FALSE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as false values.",
+    )
 
     @validator("delimiter")
     def validate_delimiter(cls, v: str) -> str:
@@ -78,3 +106,11 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
+        skip_rows_before_header = values.get("skip_rows_before_header", 0)
+        auto_generate_column_names = values.get("autogenerate_column_names", False)
+        if skip_rows_before_header > 0 and auto_generate_column_names:
+            raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.")
+        return values
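
A short sketch of the new options; the values below are illustrative, but the field names and the root-validator behaviour come from the hunk above:

from pydantic import ValidationError
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat

# All new fields have defaults, so only the ones being changed need to be set.
fmt = CsvFormat(null_values={"NA", "N/A"}, skip_rows_before_header=2, true_values={"yes"}, false_values={"no"})

# The root validator rejects skipping rows before the header while also
# autogenerating column names.
try:
    CsvFormat(skip_rows_before_header=1, autogenerate_column_names=True)
except ValidationError as err:
    print(err)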

airbyte_cdk/sources/file_based/file_based_source.py CHANGED
@@ -19,12 +19,11 @@ from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
 from airbyte_cdk.sources.streams import Stream
 from pydantic.error_wrappers import ValidationError
 
-DEFAULT_MAX_HISTORY_SIZE = 10_000
-
 
 class FileBasedSource(AbstractSource, ABC):
     def __init__(
@@ -36,7 +35,7 @@ class FileBasedSource(AbstractSource, ABC):
         discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
         parsers: Mapping[str, FileTypeParser] = default_parsers,
         validation_policies: Mapping[str, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
-        max_history_size: int = DEFAULT_MAX_HISTORY_SIZE,
+        cursor_cls: Type[AbstractFileBasedCursor] = DefaultFileBasedCursor,
     ):
         self.stream_reader = stream_reader
         self.spec_class = spec_class
@@ -46,7 +45,7 @@ class FileBasedSource(AbstractSource, ABC):
         self.validation_policies = validation_policies
         catalog = self.read_catalog(catalog_path) if catalog_path else None
         self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
-        self.max_history_size = max_history_size
+        self.cursor_cls = cursor_cls
         self.logger = logging.getLogger(f"airbyte.{self.name}")
 
     def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
@@ -104,7 +103,7 @@ class FileBasedSource(AbstractSource, ABC):
                     discovery_policy=self.discovery_policy,
                     parsers=self.parsers,
                     validation_policy=self._validate_and_get_validation_policy(stream_config),
-                    cursor=DefaultFileBasedCursor(self.max_history_size, stream_config.days_to_sync_if_history_is_full),
+                    cursor=self.cursor_cls(stream_config),
                 )
             )
         return streams
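
Connectors that previously tuned max_history_size now inject a cursor class instead; a hedged sketch (the subclass and the surrounding connector code are hypothetical):

from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor

class LargeHistoryCursor(DefaultFileBasedCursor):
    """Hypothetical cursor that tracks a larger file history."""
    DEFAULT_MAX_HISTORY_SIZE = 50_000

# Inside a connector's FileBasedSource subclass, the class (not an instance)
# is passed through the new constructor argument, e.g.:
#   super().__init__(..., cursor_cls=LargeHistoryCursor)
# and FileBasedSource instantiates it per stream as self.cursor_cls(stream_config).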

airbyte_cdk/sources/file_based/file_types/csv_parser.py CHANGED
@@ -5,12 +5,13 @@
 import csv
 import json
 import logging
-from distutils.util import strtobool
-from typing import Any, Dict, Iterable, Mapping, Optional
+from functools import partial
+from io import IOBase
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Set
 
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -34,30 +35,25 @@ class CsvParser(FileTypeParser):
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> Dict[str, Any]:
-        config_format = config.format.get(config.file_type) if config.format else None
-        if config_format:
-            if not isinstance(config_format, CsvFormat):
-                raise ValueError(f"Invalid format config: {config_format}")
-            dialect_name = config.name + DIALECT_NAME
-            csv.register_dialect(
-                dialect_name,
-                delimiter=config_format.delimiter,
-                quotechar=config_format.quote_char,
-                escapechar=config_format.escape_char,
-                doublequote=config_format.double_quote,
-                quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
-            )
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
-                # sources will likely require one. Rather than modify the interface now we can wait until the real use case
-                reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
-                schema = {field.strip(): {"type": "string"} for field in next(reader)}
-                csv.unregister_dialect(dialect_name)
-                return schema
-        else:
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                reader = csv.DictReader(fp)  # type: ignore
-                return {field.strip(): {"type": "string"} for field in next(reader)}
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            # sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            headers = self._get_headers(fp, config_format, dialect_name)
+            schema = {field.strip(): {"type": "string"} for field in headers}
+            csv.unregister_dialect(dialect_name)
+            return schema
 
     def parse_records(
         self,
@@ -67,30 +63,28 @@
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
         schema: Mapping[str, Any] = config.input_schema  # type: ignore
-        config_format = config.format.get(config.file_type) if config.format else None
-        if config_format:
-            if not isinstance(config_format, CsvFormat):
-                raise ValueError(f"Invalid format config: {config_format}")
-            # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
-            # Wwe don't unregister the dialect because we are lazily parsing each csv file to generate records
-            dialect_name = config.name + DIALECT_NAME
-            csv.register_dialect(
-                dialect_name,
-                delimiter=config_format.delimiter,
-                quotechar=config_format.quote_char,
-                escapechar=config_format.escape_char,
-                doublequote=config_format.double_quote,
-                quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
-            )
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
-                # sources will likely require one. Rather than modify the interface now we can wait until the real use case
-                reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
-                yield from self._read_and_cast_types(reader, schema, logger)
-        else:
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                reader = csv.DictReader(fp)  # type: ignore
-                yield from self._read_and_cast_types(reader, schema, logger)
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
+        # We don't unregister the dialect because we are lazily parsing each csv file to generate records
+        # This will potentially be a problem if we ever process multiple streams concurrently
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            # sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            field_names = self._auto_generate_headers(fp, config_format) if config_format.autogenerate_column_names else None
+            reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=field_names)  # type: ignore
+            yield from self._read_and_cast_types(reader, schema, config_format, logger)
 
     @property
     def file_read_mode(self) -> FileReadMode:
@@ -98,7 +92,7 @@ class CsvParser(FileTypeParser):
 
     @staticmethod
     def _read_and_cast_types(
-        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], logger: logging.Logger  # type: ignore
+        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger  # type: ignore
    ) -> Iterable[Dict[str, Any]]:
        """
        If the user provided a schema, attempt to cast the record values to the associated type.
@@ -107,16 +101,65 @@
        cast it to a string. Downstream, the user's validation policy will determine whether the
        record should be emitted.
        """
-        if not schema:
-            yield from reader
+        cast_fn = CsvParser._get_cast_function(schema, config_format, logger)
+        for i, row in enumerate(reader):
+            if i < config_format.skip_rows_after_header:
+                continue
+            # The row was not properly parsed if any of the values are None
+            if any(val is None for val in row.values()):
+                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
+            else:
+                yield CsvParser._to_nullable(cast_fn(row), config_format.null_values)
 
-        else:
+    @staticmethod
+    def _get_cast_function(
+        schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger
+    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
+        # Only cast values if the schema is provided
+        if schema:
             property_types = {col: prop["type"] for col, prop in schema["properties"].items()}
-            for row in reader:
-                yield cast_types(row, property_types, logger)
+            return partial(_cast_types, property_types=property_types, config_format=config_format, logger=logger)
+        else:
+            # If no schema is provided, yield the rows as they are
+            return _no_cast
+
+    @staticmethod
+    def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Optional[str]]:
+        nullable = row | {k: None if v in null_values else v for k, v in row.items()}
+        return nullable
+
+    @staticmethod
+    def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None:
+        """
+        Skip rows before the header. This has to be done on the file object itself, not the reader
+        """
+        for _ in range(rows_to_skip):
+            fp.readline()
+
+    def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]:
+        # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if config_format.autogenerate_column_names:
+            return self._auto_generate_headers(fp, config_format)
+        else:
+            # If we're not autogenerating column names, we need to skip the rows before the header
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            # Then read the header
+            reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
+            return next(reader)  # type: ignore
 
+    def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat) -> List[str]:
+        """
+        Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
+        See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+        """
+        next_line = next(fp).strip()
+        number_of_columns = len(next_line.split(config_format.delimiter))  # type: ignore
+        # Reset the file pointer to the beginning of the file so that the first row is not skipped
+        fp.seek(0)
+        return [f"f{i}" for i in range(number_of_columns)]
 
-def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logging.Logger) -> Dict[str, Any]:
+
+def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]:
     """
     Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.
 
@@ -142,7 +185,7 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
 
         elif python_type == bool:
             try:
-                cast_value = strtobool(value)
+                cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values)
             except ValueError:
                 warnings.append(_format_warning(key, value, prop_type))
 
@@ -178,5 +221,17 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
     return result
 
 
+def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool:
+    if value in true_values:
+        return True
+    if value in false_values:
+        return False
+    raise ValueError(f"Value {value} is not a valid boolean value")
+
+
 def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str:
     return f"{key}: value={value},expected_type={expected_type}"
+
+
+def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]:
+    return row
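
The normalization the new private helpers perform can be summarized with a standalone sketch (it mirrors _to_nullable and _value_to_bool but does not import the private CDK functions):

null_values = {"NA", ""}
true_values = {"y", "yes", "t", "true", "on", "1"}
false_values = {"n", "no", "f", "false", "off", "0"}

def to_nullable(row: dict) -> dict:
    # Values listed in null_values are replaced with None.
    return {k: None if v in null_values else v for k, v in row.items()}

def value_to_bool(value: str) -> bool:
    # Anything outside both sets raises; the caller records a casting warning.
    if value in true_values:
        return True
    if value in false_values:
        return False
    raise ValueError(f"Value {value} is not a valid boolean value")

print(to_nullable({"id": "1", "name": "NA"}))  # {'id': '1', 'name': None}
print(value_to_bool("on"))                     # True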

airbyte_cdk/sources/file_based/stream/cursor/__init__.py CHANGED
@@ -1,4 +1,4 @@
+from .abstract_file_based_cursor import AbstractFileBasedCursor
 from .default_file_based_cursor import DefaultFileBasedCursor
-from .file_based_cursor import FileBasedCursor
 
-__all__ = ["FileBasedCursor", "DefaultFileBasedCursor"]
+__all__ = ["AbstractFileBasedCursor", "DefaultFileBasedCursor"]

airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} RENAMED
@@ -7,15 +7,23 @@ from abc import ABC, abstractmethod
 from datetime import datetime
 from typing import Any, Iterable, MutableMapping
 
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
-class FileBasedCursor(ABC):
+class AbstractFileBasedCursor(ABC):
     """
     Abstract base class for cursors used by file-based streams.
     """
 
+    @abstractmethod
+    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
+        """
+        Common interface for all cursors.
+        """
+        ...
+
     @abstractmethod
     def add_file(self, file: RemoteFile) -> None:
         """

airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py CHANGED
@@ -4,26 +4,26 @@
 
 import logging
 from datetime import datetime, timedelta
-from typing import Iterable, MutableMapping, Optional
+from typing import Any, Iterable, MutableMapping, Optional
 
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.cursor.file_based_cursor import FileBasedCursor
+from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
-class DefaultFileBasedCursor(FileBasedCursor):
+class DefaultFileBasedCursor(AbstractFileBasedCursor):
     DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+    DEFAULT_MAX_HISTORY_SIZE = 10_000
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
 
-    def __init__(self, max_history_size: int, days_to_sync_if_history_is_full: Optional[int]):
+    def __init__(self, stream_config: FileBasedStreamConfig, **_: Any):
+        super().__init__(stream_config)
         self._file_to_datetime_history: MutableMapping[str, str] = {}
-        self._max_history_size = max_history_size
         self._time_window_if_history_is_full = timedelta(
-            days=days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+            days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
         )
 
-        if self._max_history_size <= 0:
-            raise ValueError(f"max_history_size must be a positive integer, got {self._max_history_size}")
         if self._time_window_if_history_is_full <= timedelta():
             raise ValueError(f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}")
 
@@ -37,7 +37,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
 
     def add_file(self, file: RemoteFile) -> None:
         self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
-        if len(self._file_to_datetime_history) > self._max_history_size:
+        if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
             # Get the earliest file based on its last modified date and its uri
             oldest_file = self._compute_earliest_file_in_history()
             if oldest_file:
@@ -67,7 +67,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
         """
         Returns true if the state's history is full, meaning new entries will start to replace old entries.
         """
-        return len(self._file_to_datetime_history) >= self._max_history_size
+        return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE
 
     def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
         if file.uri in self._file_to_datetime_history:

airbyte_cdk/sources/file_based/stream/default_file_based_stream.py CHANGED
@@ -15,13 +15,14 @@ from airbyte_cdk.sources.file_based.exceptions import (
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
+    RecordParseError,
     SchemaInferenceError,
     StopSyncPerValidationPolicy,
 )
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
-from airbyte_cdk.sources.file_based.stream.cursor import FileBasedCursor
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamSlice
 from airbyte_cdk.sources.streams import IncrementalMixin
 from airbyte_cdk.sources.streams.core import JsonSchema
@@ -39,7 +40,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     ab_file_name_col = "_ab_source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
 
-    def __init__(self, cursor: FileBasedCursor, **kwargs: Any):
+    def __init__(self, cursor: AbstractFileBasedCursor, **kwargs: Any):
         super().__init__(**kwargs)
         self._cursor = cursor
 
@@ -105,6 +106,18 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
                     )
                     break
 
+                except RecordParseError:
+                    # Increment line_no because the exception was raised before we could increment it
+                    line_no += 1
+                    yield AirbyteMessage(
+                        type=MessageType.LOG,
+                        log=AirbyteLogMessage(
+                            level=Level.ERROR,
+                            message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
+                            stack_trace=traceback.format_exc(),
+                        ),
+                    )
+
                 except Exception:
                     yield AirbyteMessage(
                         type=MessageType.LOG,

{airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.50.0
+Version: 0.50.2
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte