airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/entrypoint.py +7 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
- airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
- airbyte_cdk/sources/file_based/file_based_source.py +4 -5
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
- airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
- unit_tests/sources/file_based/config/test_csv_format.py +23 -0
- unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/in_memory_files_source.py +11 -3
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
- unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
- unit_tests/sources/file_based/test_scenarios.py +30 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
airbyte_cdk/entrypoint.py
CHANGED
@@ -181,6 +181,13 @@ class AirbyteEntrypoint(object):
             return parsed_args.catalog
         return None
 
+    @classmethod
+    def extract_config(cls, args: List[str]) -> Optional[Any]:
+        parsed_args = cls.parse_args(args)
+        if hasattr(parsed_args, "config"):
+            return parsed_args.config
+        return None
+
     def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
         if hasattr(source, "message_repository") and source.message_repository:
            yield from source.message_repository.consume_queue()
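The new extract_config classmethod mirrors the catalog helper visible just above it: it parses an argv-style list and returns the value of the --config argument when one is present. A minimal sketch of how it could be called (the argv below is illustrative; the exact commands and flags accepted are defined by AirbyteEntrypoint.parse_args):

from airbyte_cdk.entrypoint import AirbyteEntrypoint

# Illustrative argv; "check --config <path>" is the usual shape.
args = ["check", "--config", "secrets/config.json"]

config_path = AirbyteEntrypoint.extract_config(args)
if config_path is None:
    print("no --config argument was passed")
else:
    print(f"config located at {config_path}")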
airbyte_cdk/sources/declarative/declarative_component_schema.yaml
CHANGED
@@ -578,9 +578,9 @@ definitions:
           - "created_at"
           - "{{ config['record_cursor'] }}"
       datetime_format:
-        title:
+        title: Outgoing Datetime Format
         description: |
-          The datetime format
+          The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:
           * **%s**: Epoch unix timestamp - `1686218963`
           * **%a**: Weekday (abbreviated) - `Sun`
           * **%A**: Weekday (full) - `Sunday`
@@ -626,7 +626,7 @@ definitions:
           - "{{ config['start_time'] }}"
       cursor_datetime_formats:
         title: Cursor Datetime Formats
-        description: The possible formats for the cursor field
+        description: The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.
         type: array
         items:
           type: string
airbyte_cdk/sources/declarative/models/declarative_component_schema.py
CHANGED
@@ -810,9 +810,9 @@ class DatetimeBasedCursor(BaseModel):
     )
     datetime_format: str = Field(
         ...,
-        description="The datetime format
+        description="The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n  * **%s**: Epoch unix timestamp - `1686218963`\n  * **%a**: Weekday (abbreviated) - `Sun`\n  * **%A**: Weekday (full) - `Sunday`\n  * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n  * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n  * **%b**: Month (abbreviated) - `Jan`\n  * **%B**: Month (full) - `January`\n  * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n  * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n  * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n  * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n  * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n  * **%p**: AM/PM indicator\n  * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n  * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n  * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n  * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n  * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n  * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n  * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n  * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n  * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n  * **%x**: Date standard format - `08/16/1988`\n  * **%X**: Time standard format - `21:30:00`\n  * **%%**: Literal '%' character\n\n  Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
         examples=["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d", "%s"],
-        title="
+        title="Outgoing Datetime Format",
     )
     start_datetime: Union[str, MinMaxDatetime] = Field(
         ...,
@@ -822,7 +822,7 @@ class DatetimeBasedCursor(BaseModel):
     )
     cursor_datetime_formats: Optional[List[str]] = Field(
         None,
-        description="The possible formats for the cursor field",
+        description="The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.",
         title="Cursor Datetime Formats",
     )
     cursor_granularity: Optional[str] = Field(
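The updated cursor_datetime_formats description specifies first-match-wins parsing. The sketch below illustrates that documented behavior using only the standard library; it is not the CDK's internal implementation:

from datetime import datetime
from typing import List

def parse_cursor_value(value: str, cursor_datetime_formats: List[str]) -> datetime:
    # Try each configured format in order and return the first successful parse.
    for fmt in cursor_datetime_formats:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError(f"{value!r} does not match any of {cursor_datetime_formats}")

parse_cursor_value("2023-07-25T12:00:00", ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"])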
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
CHANGED
@@ -4,7 +4,7 @@
 
 import logging
 import traceback
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 from airbyte_cdk.sources import Source
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
@@ -12,14 +12,16 @@ from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, Fi
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
-
+
+if TYPE_CHECKING:
+    from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
 
 
 class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy):
     def __init__(self, stream_reader: AbstractFileBasedStreamReader):
         self.stream_reader = stream_reader
 
-    def check_availability(self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
+    def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
         """
         Perform a connection check for the stream (verify that we can list files from the stream).
 
@@ -33,7 +35,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         return True, None
 
     def check_availability_and_parsability(
-        self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]
+        self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]
     ) -> Tuple[bool, Optional[str]]:
         """
         Perform a connection check for the stream.
@@ -51,8 +53,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         - If the user provided a schema in the config, check that a subset of records in
           one file conform to the schema via a call to stream.conforms_to_schema(schema).
         """
-        if not isinstance(stream, AbstractFileBasedStream):
-            raise ValueError(f"Stream {stream.name} is not a file-based stream.")
         try:
             files = self._check_list_files(stream)
             self._check_extensions(stream, files)
@@ -62,7 +62,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return True, None
 
-    def _check_list_files(self, stream: AbstractFileBasedStream) -> List[RemoteFile]:
+    def _check_list_files(self, stream: "AbstractFileBasedStream") -> List[RemoteFile]:
         try:
             files = stream.list_files()
         except Exception as exc:
@@ -73,12 +73,12 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return files
 
-    def _check_extensions(self, stream: AbstractFileBasedStream, files: List[RemoteFile]) -> None:
+    def _check_extensions(self, stream: "AbstractFileBasedStream", files: List[RemoteFile]) -> None:
         if not all(f.extension_agrees_with_file_type(stream.config.file_type) for f in files):
             raise CheckAvailabilityError(FileBasedSourceError.EXTENSION_MISMATCH, stream=stream.name)
         return None
 
-    def _check_parse_record(self, stream: AbstractFileBasedStream, file: RemoteFile, logger: logging.Logger) -> None:
+    def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
         parser = stream.get_parser(stream.config.file_type)
 
         try:
airbyte_cdk/sources/file_based/config/csv_format.py
CHANGED
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional
+from typing import Any, Mapping, Optional, Set
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -17,6 +17,10 @@ class QuotingBehavior(Enum):
     QUOTE_NONE = "Quote None"
 
 
+DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
+DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
+
+
 class CsvFormat(BaseModel):
     filetype: Literal["csv"] = "csv"
     delimiter: str = Field(
@@ -46,10 +50,34 @@ class CsvFormat(BaseModel):
         default=QuotingBehavior.QUOTE_SPECIAL_CHARACTERS,
         description="The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.",
     )
-
-
-
-
+    null_values: Set[str] = Field(
+        title="Null Values",
+        default=[],
+        description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
+    )
+    skip_rows_before_header: int = Field(
+        title="Skip Rows Before Header",
+        default=0,
+        description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
+    )
+    skip_rows_after_header: int = Field(
+        title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
+    )
+    autogenerate_column_names: bool = Field(
+        title="Autogenerate Column Names",
+        default=False,
+        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    )
+    true_values: Set[str] = Field(
+        title="True Values",
+        default=DEFAULT_TRUE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as true values.",
+    )
+    false_values: Set[str] = Field(
+        title="False Values",
+        default=DEFAULT_FALSE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as false values.",
+    )
 
     @validator("delimiter")
     def validate_delimiter(cls, v: str) -> str:
@@ -78,3 +106,11 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
+        skip_rows_before_header = values.get("skip_rows_before_header", 0)
+        auto_generate_column_names = values.get("autogenerate_column_names", False)
+        if skip_rows_before_header > 0 and auto_generate_column_names:
+            raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.")
+        return values
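A short sketch of the new CsvFormat options and the root validator in use (values are illustrative; airbyte-cdk 0.50.x uses pydantic v1, whose ValidationError subclasses ValueError):

from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat

# The new fields can be set alongside the existing ones.
fmt = CsvFormat(
    delimiter="|",
    null_values={"NA", "N/A"},
    skip_rows_before_header=2,
    true_values={"yes"},
    false_values={"no"},
)

# The new root validator rejects skipping rows before the header while also
# autogenerating column names.
try:
    CsvFormat(skip_rows_before_header=1, autogenerate_column_names=True)
except ValueError as exc:
    print(exc)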
airbyte_cdk/sources/file_based/file_based_source.py
CHANGED
@@ -19,12 +19,11 @@ from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
 from airbyte_cdk.sources.streams import Stream
 from pydantic.error_wrappers import ValidationError
 
-DEFAULT_MAX_HISTORY_SIZE = 10_000
-
 
 class FileBasedSource(AbstractSource, ABC):
     def __init__(
@@ -36,7 +35,7 @@ class FileBasedSource(AbstractSource, ABC):
         discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
         parsers: Mapping[str, FileTypeParser] = default_parsers,
         validation_policies: Mapping[str, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
-
+        cursor_cls: Type[AbstractFileBasedCursor] = DefaultFileBasedCursor,
     ):
         self.stream_reader = stream_reader
         self.spec_class = spec_class
@@ -46,7 +45,7 @@ class FileBasedSource(AbstractSource, ABC):
         self.validation_policies = validation_policies
         catalog = self.read_catalog(catalog_path) if catalog_path else None
         self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
-        self.
+        self.cursor_cls = cursor_cls
         self.logger = logging.getLogger(f"airbyte.{self.name}")
 
     def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
@@ -104,7 +103,7 @@ class FileBasedSource(AbstractSource, ABC):
                     discovery_policy=self.discovery_policy,
                     parsers=self.parsers,
                     validation_policy=self._validate_and_get_validation_policy(stream_config),
-                    cursor=
+                    cursor=self.cursor_cls(stream_config),
                 )
             )
         return streams
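With the new cursor_cls parameter, a concrete source can inject its own cursor class instead of always getting DefaultFileBasedCursor; each stream is then constructed with cursor=self.cursor_cls(stream_config). A hedged sketch (MyCustomCursor is hypothetical):

from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor

class MyCustomCursor(DefaultFileBasedCursor):
    """Hypothetical cursor that widens the fallback sync window."""
    DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 7

# In a concrete FileBasedSource subclass, the cursor class would be passed
# through to the base constructor, e.g.:
#     super().__init__(..., cursor_cls=MyCustomCursor)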
airbyte_cdk/sources/file_based/file_types/csv_parser.py
CHANGED
@@ -5,12 +5,13 @@
 import csv
 import json
 import logging
-from
-from
+from functools import partial
+from io import IOBase
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Set
 
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -34,30 +35,25 @@ class CsvParser(FileTypeParser):
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> Dict[str, Any]:
-        config_format = config.format.get(config.file_type) if config.format else
-        if config_format:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            return schema
-        else:
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                reader = csv.DictReader(fp)  # type: ignore
-                return {field.strip(): {"type": "string"} for field in next(reader)}
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            headers = self._get_headers(fp, config_format, dialect_name)
+            schema = {field.strip(): {"type": "string"} for field in headers}
+            csv.unregister_dialect(dialect_name)
+            return schema
 
     def parse_records(
         self,
@@ -67,30 +63,28 @@ class CsvParser(FileTypeParser):
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
         schema: Mapping[str, Any] = config.input_schema  # type: ignore
-        config_format = config.format.get(config.file_type) if config.format else
-        if config_format:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            reader = csv.DictReader(fp)  # type: ignore
-            yield from self._read_and_cast_types(reader, schema, logger)
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
+        # We don't unregister the dialect because we are lazily parsing each csv file to generate records
+        # This will potentially be a problem if we ever process multiple streams concurrently
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            field_names = self._auto_generate_headers(fp, config_format) if config_format.autogenerate_column_names else None
+            reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=field_names)  # type: ignore
+            yield from self._read_and_cast_types(reader, schema, config_format, logger)
 
     @property
     def file_read_mode(self) -> FileReadMode:
@@ -98,7 +92,7 @@ class CsvParser(FileTypeParser):
 
     @staticmethod
     def _read_and_cast_types(
-        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], logger: logging.Logger  # type: ignore
+        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger  # type: ignore
     ) -> Iterable[Dict[str, Any]]:
         """
         If the user provided a schema, attempt to cast the record values to the associated type.
@@ -107,16 +101,65 @@ class CsvParser(FileTypeParser):
        cast it to a string. Downstream, the user's validation policy will determine whether the
        record should be emitted.
        """
-
-
+        cast_fn = CsvParser._get_cast_function(schema, config_format, logger)
+        for i, row in enumerate(reader):
+            if i < config_format.skip_rows_after_header:
+                continue
+            # The row was not properly parsed if any of the values are None
+            if any(val is None for val in row.values()):
+                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
+            else:
+                yield CsvParser._to_nullable(cast_fn(row), config_format.null_values)
 
-
+    @staticmethod
+    def _get_cast_function(
+        schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger
+    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
+        # Only cast values if the schema is provided
+        if schema:
             property_types = {col: prop["type"] for col, prop in schema["properties"].items()}
-
-
+            return partial(_cast_types, property_types=property_types, config_format=config_format, logger=logger)
+        else:
+            # If no schema is provided, yield the rows as they are
+            return _no_cast
+
+    @staticmethod
+    def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Optional[str]]:
+        nullable = row | {k: None if v in null_values else v for k, v in row.items()}
+        return nullable
+
+    @staticmethod
+    def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None:
+        """
+        Skip rows before the header. This has to be done on the file object itself, not the reader
+        """
+        for _ in range(rows_to_skip):
+            fp.readline()
+
+    def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]:
+        # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if config_format.autogenerate_column_names:
+            return self._auto_generate_headers(fp, config_format)
+        else:
+            # If we're not autogenerating column names, we need to skip the rows before the header
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            # Then read the header
+            reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
+            return next(reader)  # type: ignore
 
+    def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat) -> List[str]:
+        """
+        Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
+        See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+        """
+        next_line = next(fp).strip()
+        number_of_columns = len(next_line.split(config_format.delimiter))  # type: ignore
+        # Reset the file pointer to the beginning of the file so that the first row is not skipped
+        fp.seek(0)
+        return [f"f{i}" for i in range(number_of_columns)]
 
-
+
+
+def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]:
     """
     Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.
 
@@ -142,7 +185,7 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
 
         elif python_type == bool:
             try:
-                cast_value =
+                cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values)
             except ValueError:
                 warnings.append(_format_warning(key, value, prop_type))
 
@@ -178,5 +221,17 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
     return result
 
 
+def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool:
+    if value in true_values:
+        return True
+    if value in false_values:
+        return False
+    raise ValueError(f"Value {value} is not a valid boolean value")
+
+
 def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str:
     return f"{key}: value={value},expected_type={expected_type}"
+
+
+def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]:
+    return row
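The new row handling can be pictured with a standalone snippet that mirrors what _to_nullable and _value_to_bool do with the configured null/true/false sets; this is an illustration, not the CDK code itself:

import csv
import io

null_values = {"NA", ""}
true_values = {"y", "yes", "t", "true", "on", "1"}
false_values = {"n", "no", "f", "false", "off", "0"}

data = io.StringIO("id,active,note\n1,yes,NA\n2,no,hello\n")
for row in csv.DictReader(data):
    # Map configured null markers to None, as _to_nullable does.
    row = {k: None if v in null_values else v for k, v in row.items()}
    # Interpret booleans via the configured sets, as _value_to_bool does.
    if row["active"] in true_values:
        row["active"] = True
    elif row["active"] in false_values:
        row["active"] = False
    print(row)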
airbyte_cdk/sources/file_based/stream/cursor/__init__.py
CHANGED
@@ -1,4 +1,4 @@
+from .abstract_file_based_cursor import AbstractFileBasedCursor
 from .default_file_based_cursor import DefaultFileBasedCursor
-from .file_based_cursor import FileBasedCursor
 
-__all__ = ["
+__all__ = ["AbstractFileBasedCursor", "DefaultFileBasedCursor"]
airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py}
RENAMED
@@ -7,15 +7,23 @@ from abc import ABC, abstractmethod
 from datetime import datetime
 from typing import Any, Iterable, MutableMapping
 
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
-class
+class AbstractFileBasedCursor(ABC):
     """
     Abstract base class for cursors used by file-based streams.
     """
 
+    @abstractmethod
+    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
+        """
+        Common interface for all cursors.
+        """
+        ...
+
     @abstractmethod
     def add_file(self, file: RemoteFile) -> None:
         """
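Because __init__ is now part of the abstract interface, a custom cursor declares the same (stream_config, **kwargs) constructor. A partial sketch (only the methods visible in this hunk are shown; the remaining abstract methods of AbstractFileBasedCursor still need concrete implementations):

from typing import Any

from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor

class NoOpCursor(AbstractFileBasedCursor):  # hypothetical example
    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
        self._stream_config = stream_config

    def add_file(self, file: RemoteFile) -> None:
        # A real cursor would record file.uri and file.last_modified here.
        pass

    # ...the other abstract methods (state handling, file filtering) must also
    # be implemented before this class can be instantiated.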
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py
CHANGED
@@ -4,26 +4,26 @@
 
 import logging
 from datetime import datetime, timedelta
-from typing import Iterable, MutableMapping, Optional
+from typing import Any, Iterable, MutableMapping, Optional
 
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.cursor.
+from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
-class DefaultFileBasedCursor(
+class DefaultFileBasedCursor(AbstractFileBasedCursor):
     DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+    DEFAULT_MAX_HISTORY_SIZE = 10_000
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
 
-    def __init__(self,
+    def __init__(self, stream_config: FileBasedStreamConfig, **_: Any):
+        super().__init__(stream_config)
         self._file_to_datetime_history: MutableMapping[str, str] = {}
-        self._max_history_size = max_history_size
         self._time_window_if_history_is_full = timedelta(
-            days=days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+            days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
         )
 
-        if self._max_history_size <= 0:
-            raise ValueError(f"max_history_size must be a positive integer, got {self._max_history_size}")
         if self._time_window_if_history_is_full <= timedelta():
             raise ValueError(f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}")
 
@@ -37,7 +37,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
 
     def add_file(self, file: RemoteFile) -> None:
         self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
-        if len(self._file_to_datetime_history) > self.
+        if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
             # Get the earliest file based on its last modified date and its uri
             oldest_file = self._compute_earliest_file_in_history()
             if oldest_file:
@@ -67,7 +67,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
         """
         Returns true if the state's history is full, meaning new entries will start to replace old entries.
         """
-        return len(self._file_to_datetime_history) >= self.
+        return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE
 
     def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
         if file.uri in self._file_to_datetime_history:
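The cap that add_file enforces via DEFAULT_MAX_HISTORY_SIZE can be illustrated with a small standalone sketch of the eviction idea; this is not the CDK implementation:

from datetime import datetime

MAX_HISTORY_SIZE = 3  # DefaultFileBasedCursor uses 10_000

history: dict = {}

def add_file(uri: str, last_modified: datetime) -> None:
    history[uri] = last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    if len(history) > MAX_HISTORY_SIZE:
        # Evict the earliest entry by (timestamp, uri), mirroring
        # _compute_earliest_file_in_history in DefaultFileBasedCursor.
        oldest_uri = min(history.items(), key=lambda kv: (kv[1], kv[0]))[0]
        del history[oldest_uri]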
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py
CHANGED
@@ -15,13 +15,14 @@ from airbyte_cdk.sources.file_based.exceptions import (
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
+    RecordParseError,
     SchemaInferenceError,
     StopSyncPerValidationPolicy,
 )
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
-from airbyte_cdk.sources.file_based.stream.cursor import
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamSlice
 from airbyte_cdk.sources.streams import IncrementalMixin
 from airbyte_cdk.sources.streams.core import JsonSchema
@@ -39,7 +40,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     ab_file_name_col = "_ab_source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
 
-    def __init__(self, cursor:
+    def __init__(self, cursor: AbstractFileBasedCursor, **kwargs: Any):
         super().__init__(**kwargs)
         self._cursor = cursor
 
@@ -105,6 +106,18 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
                 )
                 break
 
+            except RecordParseError:
+                # Increment line_no because the exception was raised before we could increment it
+                line_no += 1
+                yield AirbyteMessage(
+                    type=MessageType.LOG,
+                    log=AirbyteLogMessage(
+                        level=Level.ERROR,
+                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
+                        stack_trace=traceback.format_exc(),
+                    ),
+                )
+
             except Exception:
                 yield AirbyteMessage(
                     type=MessageType.LOG,