airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl
- airbyte_cdk/entrypoint.py +7 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
- airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
- airbyte_cdk/sources/file_based/file_based_source.py +4 -5
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
- airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
- unit_tests/sources/file_based/config/test_csv_format.py +23 -0
- unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/in_memory_files_source.py +11 -3
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
- unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
- unit_tests/sources/file_based/test_scenarios.py +30 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
airbyte_cdk/entrypoint.py
CHANGED
@@ -181,6 +181,13 @@ class AirbyteEntrypoint(object):
             return parsed_args.catalog
         return None

+    @classmethod
+    def extract_config(cls, args: List[str]) -> Optional[Any]:
+        parsed_args = cls.parse_args(args)
+        if hasattr(parsed_args, "config"):
+            return parsed_args.config
+        return None
+
     def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
         if hasattr(source, "message_repository") and source.message_repository:
            yield from source.message_repository.consume_queue()
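The new `extract_config` classmethod mirrors the existing `extract_catalog` helper: it parses the CLI arguments and returns whatever was passed for `--config`, without needing an entrypoint instance. A minimal usage sketch; the wrapper function and the exact CLI layout (`read --config … --catalog …`) are assumptions for illustration, not part of the CDK:

```python
# Hypothetical pre-launch hook: peek at the --config value before the source runs.
import sys
from typing import Optional

from airbyte_cdk.entrypoint import AirbyteEntrypoint


def config_path_from_argv() -> Optional[str]:
    # extract_config is a classmethod, so no AirbyteEntrypoint (or source) instance is needed.
    return AirbyteEntrypoint.extract_config(sys.argv[1:])


if __name__ == "__main__":
    # e.g. `python main.py read --config secrets/config.json --catalog catalog.json`
    print(config_path_from_argv())
```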
airbyte_cdk/sources/declarative/declarative_component_schema.yaml
CHANGED
@@ -578,9 +578,9 @@ definitions:
           - "created_at"
           - "{{ config['record_cursor'] }}"
       datetime_format:
-        title:
+        title: Outgoing Datetime Format
         description: |
-          The datetime format
+          The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:
           * **%s**: Epoch unix timestamp - `1686218963`
           * **%a**: Weekday (abbreviated) - `Sun`
           * **%A**: Weekday (full) - `Sunday`
@@ -626,7 +626,7 @@ definitions:
           - "{{ config['start_time'] }}"
       cursor_datetime_formats:
         title: Cursor Datetime Formats
-        description: The possible formats for the cursor field
+        description: The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.
         type: array
         items:
           type: string
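The updated description spells out the fallback behavior of `cursor_datetime_formats`. A small plain-Python illustration of that behavior (not the CDK's actual parsing code; the function name is made up):

```python
# First matching format wins; the single datetime_format is the fallback when no list is given.
from datetime import datetime
from typing import List, Optional


def parse_cursor_value(value: str, cursor_datetime_formats: Optional[List[str]], datetime_format: str) -> datetime:
    formats = cursor_datetime_formats or [datetime_format]
    for fmt in formats:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue  # try the next candidate format
    raise ValueError(f"{value!r} did not match any of {formats}")


print(parse_cursor_value(
    "2023-06-08T10:09:23.000000+0000",
    ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S.%f%z"],
    "%Y-%m-%d",
))
```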
airbyte_cdk/sources/declarative/models/declarative_component_schema.py
CHANGED
@@ -810,9 +810,9 @@ class DatetimeBasedCursor(BaseModel):
     )
     datetime_format: str = Field(
         ...,
-        description="The datetime format
+        description="The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
         examples=["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d", "%s"],
-        title="
+        title="Outgoing Datetime Format",
     )
     start_datetime: Union[str, MinMaxDatetime] = Field(
         ...,
@@ -822,7 +822,7 @@ class DatetimeBasedCursor(BaseModel):
     )
     cursor_datetime_formats: Optional[List[str]] = Field(
         None,
-        description="The possible formats for the cursor field",
+        description="The possible formats for the cursor field, in order of preference. The first format that matches the cursor field value will be used to parse it. If not provided, the `datetime_format` will be used.",
         title="Cursor Datetime Formats",
     )
     cursor_granularity: Optional[str] = Field(
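The renamed "Outgoing Datetime Format" is a strftime-style pattern used to serialize datetimes into outgoing requests. A quick illustration with one of the documented example formats (the request-parameter name below is made up):

```python
from datetime import datetime, timezone

outgoing_format = "%Y-%m-%dT%H:%M:%S.%f%z"  # one of the documented examples
start = datetime(2023, 6, 8, 10, 9, 23, tzinfo=timezone.utc)

# How a start datetime might be rendered into a request parameter.
params = {"updated_after": start.strftime(outgoing_format)}
print(params)  # {'updated_after': '2023-06-08T10:09:23.000000+0000'}
```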
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
CHANGED
@@ -4,7 +4,7 @@

 import logging
 import traceback
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple

 from airbyte_cdk.sources import Source
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
@@ -12,14 +12,16 @@ from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, Fi
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
-
+
+if TYPE_CHECKING:
+    from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream


 class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy):
     def __init__(self, stream_reader: AbstractFileBasedStreamReader):
         self.stream_reader = stream_reader

-    def check_availability(self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
+    def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
         """
         Perform a connection check for the stream (verify that we can list files from the stream).

@@ -33,7 +35,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         return True, None

     def check_availability_and_parsability(
-        self, stream: AbstractFileBasedStream, logger: logging.Logger, _: Optional[Source]
+        self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]
     ) -> Tuple[bool, Optional[str]]:
         """
         Perform a connection check for the stream.
@@ -51,8 +53,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
        - If the user provided a schema in the config, check that a subset of records in
          one file conform to the schema via a call to stream.conforms_to_schema(schema).
        """
-        if not isinstance(stream, AbstractFileBasedStream):
-            raise ValueError(f"Stream {stream.name} is not a file-based stream.")
         try:
             files = self._check_list_files(stream)
             self._check_extensions(stream, files)
@@ -62,7 +62,7 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy

         return True, None

-    def _check_list_files(self, stream: AbstractFileBasedStream) -> List[RemoteFile]:
+    def _check_list_files(self, stream: "AbstractFileBasedStream") -> List[RemoteFile]:
         try:
             files = stream.list_files()
         except Exception as exc:
@@ -73,12 +73,12 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy

         return files

-    def _check_extensions(self, stream: AbstractFileBasedStream, files: List[RemoteFile]) -> None:
+    def _check_extensions(self, stream: "AbstractFileBasedStream", files: List[RemoteFile]) -> None:
         if not all(f.extension_agrees_with_file_type(stream.config.file_type) for f in files):
             raise CheckAvailabilityError(FileBasedSourceError.EXTENSION_MISMATCH, stream=stream.name)
         return None

-    def _check_parse_record(self, stream: AbstractFileBasedStream, file: RemoteFile, logger: logging.Logger) -> None:
+    def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
         parser = stream.get_parser(stream.config.file_type)

         try:
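This change swaps the runtime import of `AbstractFileBasedStream` for a `TYPE_CHECKING` guard plus quoted annotations, the standard way to keep type hints while avoiding a circular import. A self-contained sketch of the pattern (module and class names below are made up):

```python
# Import a type only for static type checking and reference it as a string at
# runtime, so two modules can refer to each other without a circular-import error.
from typing import TYPE_CHECKING, Tuple

if TYPE_CHECKING:  # evaluated by type checkers, skipped at runtime
    from my_source.streams import MyStream  # hypothetical module


class MyAvailabilityStrategy:
    def check_availability(self, stream: "MyStream") -> Tuple[bool, str]:
        # The quoted annotation is resolved lazily, so MyStream never has to be
        # imported while this module is being loaded.
        return True, ""
```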
airbyte_cdk/sources/file_based/config/csv_format.py
CHANGED
@@ -4,9 +4,9 @@

 import codecs
 from enum import Enum
-from typing import Optional
+from typing import Any, Mapping, Optional, Set

-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, root_validator, validator
 from typing_extensions import Literal


@@ -17,6 +17,10 @@ class QuotingBehavior(Enum):
     QUOTE_NONE = "Quote None"


+DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
+DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
+
+
 class CsvFormat(BaseModel):
     filetype: Literal["csv"] = "csv"
     delimiter: str = Field(
@@ -46,10 +50,34 @@ class CsvFormat(BaseModel):
         default=QuotingBehavior.QUOTE_SPECIAL_CHARACTERS,
         description="The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.",
     )
-
-
-
-
+    null_values: Set[str] = Field(
+        title="Null Values",
+        default=[],
+        description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
+    )
+    skip_rows_before_header: int = Field(
+        title="Skip Rows Before Header",
+        default=0,
+        description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
+    )
+    skip_rows_after_header: int = Field(
+        title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
+    )
+    autogenerate_column_names: bool = Field(
+        title="Autogenerate Column Names",
+        default=False,
+        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    )
+    true_values: Set[str] = Field(
+        title="True Values",
+        default=DEFAULT_TRUE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as true values.",
+    )
+    false_values: Set[str] = Field(
+        title="False Values",
+        default=DEFAULT_FALSE_VALUES,
+        description="A set of case-sensitive strings that should be interpreted as false values.",
+    )

     @validator("delimiter")
     def validate_delimiter(cls, v: str) -> str:
@@ -78,3 +106,11 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_option_combinations(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
+        skip_rows_before_header = values.get("skip_rows_before_header", 0)
+        auto_generate_column_names = values.get("autogenerate_column_names", False)
+        if skip_rows_before_header > 0 and auto_generate_column_names:
+            raise ValueError("Cannot skip rows before header and autogenerate column names at the same time.")
+        return values
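A usage sketch of the new CSV options, assuming airbyte-cdk >= 0.50.2 is installed and using only the field names shown in the diff; the exact validation messages come from pydantic and may differ:

```python
from pydantic import ValidationError

from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat

fmt = CsvFormat(
    null_values={"NA", "N/A"},
    skip_rows_before_header=2,  # header is on the 3rd physical row
    true_values={"yes"},
    false_values={"no"},
)
print(fmt.skip_rows_before_header)  # 2

try:
    # Rejected by the root_validator added in this release.
    CsvFormat(skip_rows_before_header=1, autogenerate_column_names=True)
except ValidationError as exc:
    print(exc.errors()[0]["msg"])
```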
airbyte_cdk/sources/file_based/file_based_source.py
CHANGED
@@ -19,12 +19,11 @@ from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
 from airbyte_cdk.sources.streams import Stream
 from pydantic.error_wrappers import ValidationError

-DEFAULT_MAX_HISTORY_SIZE = 10_000
-

 class FileBasedSource(AbstractSource, ABC):
     def __init__(
@@ -36,7 +35,7 @@ class FileBasedSource(AbstractSource, ABC):
         discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
         parsers: Mapping[str, FileTypeParser] = default_parsers,
         validation_policies: Mapping[str, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
-
+        cursor_cls: Type[AbstractFileBasedCursor] = DefaultFileBasedCursor,
     ):
         self.stream_reader = stream_reader
         self.spec_class = spec_class
@@ -46,7 +45,7 @@ class FileBasedSource(AbstractSource, ABC):
         self.validation_policies = validation_policies
         catalog = self.read_catalog(catalog_path) if catalog_path else None
         self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
-        self.
+        self.cursor_cls = cursor_cls
         self.logger = logging.getLogger(f"airbyte.{self.name}")

     def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
@@ -104,7 +103,7 @@ class FileBasedSource(AbstractSource, ABC):
                     discovery_policy=self.discovery_policy,
                     parsers=self.parsers,
                     validation_policy=self._validate_and_get_validation_policy(stream_config),
-                    cursor=
+                    cursor=self.cursor_cls(stream_config),
                 )
             )
         return streams
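The source now accepts a cursor *class* (`cursor_cls`) instead of a fixed cursor instance, and builds one cursor per stream from that stream's config. A generic, self-contained sketch of this constructor-injection pattern (the names below are illustrative, not the CDK classes):

```python
from dataclasses import dataclass
from typing import List, Type


@dataclass
class StreamConfig:
    name: str
    days_to_sync_if_history_is_full: int = 3


class BaseCursor:
    def __init__(self, stream_config: StreamConfig, **kwargs):
        self.stream_config = stream_config


class MySource:
    def __init__(self, cursor_cls: Type[BaseCursor] = BaseCursor):
        self.cursor_cls = cursor_cls

    def streams(self, stream_configs: List[StreamConfig]) -> List[BaseCursor]:
        # One cursor instance per configured stream, built from the injected class.
        return [self.cursor_cls(cfg) for cfg in stream_configs]


source = MySource()  # or MySource(cursor_cls=SomeCustomCursor)
print(len(source.streams([StreamConfig("users"), StreamConfig("orders")])))  # 2
```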
airbyte_cdk/sources/file_based/file_types/csv_parser.py
CHANGED
@@ -5,12 +5,13 @@
 import csv
 import json
 import logging
-from
-from
+from functools import partial
+from io import IOBase
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Set

 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, QuotingBehavior
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -34,30 +35,25 @@ class CsvParser(FileTypeParser):
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> Dict[str, Any]:
-        config_format = config.format.get(config.file_type) if config.format else
-        if config_format:
-            [17 removed lines not shown in the original rendering]
-            return schema
-        else:
-            with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
-                reader = csv.DictReader(fp)  # type: ignore
-                return {field.strip(): {"type": "string"} for field in next(reader)}
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            headers = self._get_headers(fp, config_format, dialect_name)
+            schema = {field.strip(): {"type": "string"} for field in headers}
+        csv.unregister_dialect(dialect_name)
+        return schema

     def parse_records(
         self,
@@ -67,30 +63,28 @@ class CsvParser(FileTypeParser):
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
         schema: Mapping[str, Any] = config.input_schema  # type: ignore
-        config_format = config.format.get(config.file_type) if config.format else
-        if config_format:
-            [20 removed lines not shown in the original rendering]
-            reader = csv.DictReader(fp)  # type: ignore
-            yield from self._read_and_cast_types(reader, schema, logger)
+        config_format = config.format.get(config.file_type) if config.format else CsvFormat()
+        if not isinstance(config_format, CsvFormat):
+            raise ValueError(f"Invalid format config: {config_format}")
+        # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
+        # We don't unregister the dialect because we are lazily parsing each csv file to generate records
+        # This will potentially be a problem if we ever process multiple streams concurrently
+        dialect_name = config.name + DIALECT_NAME
+        csv.register_dialect(
+            dialect_name,
+            delimiter=config_format.delimiter,
+            quotechar=config_format.quote_char,
+            escapechar=config_format.escape_char,
+            doublequote=config_format.double_quote,
+            quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
+        )
+        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
+            #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            field_names = self._auto_generate_headers(fp, config_format) if config_format.autogenerate_column_names else None
+            reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=field_names)  # type: ignore
+            yield from self._read_and_cast_types(reader, schema, config_format, logger)

     @property
     def file_read_mode(self) -> FileReadMode:
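A stand-alone, standard-library-only illustration of what the reworked `parse_records` path does: register a per-stream csv dialect, skip the configured rows before the header on the file object, then hand the file to `csv.DictReader` (this mimics the flow, it is not the CDK class itself):

```python
import csv
from io import StringIO

raw = "junk line to skip\ncol_a|col_b\n1|x\n2|y\n"
dialect_name = "users_csv_dialect"  # per-stream name, as in the diff
csv.register_dialect(dialect_name, delimiter="|", quotechar='"', doublequote=True)

fp = StringIO(raw)
for _ in range(1):      # skip_rows_before_header = 1
    fp.readline()       # skipping happens on the file object, not the reader

reader = csv.DictReader(fp, dialect=dialect_name)
print(list(reader))     # [{'col_a': '1', 'col_b': 'x'}, {'col_a': '2', 'col_b': 'y'}]
csv.unregister_dialect(dialect_name)
```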
@@ -98,7 +92,7 @@ class CsvParser(FileTypeParser):

     @staticmethod
     def _read_and_cast_types(
-        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], logger: logging.Logger  # type: ignore
+        reader: csv.DictReader, schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger  # type: ignore
     ) -> Iterable[Dict[str, Any]]:
         """
         If the user provided a schema, attempt to cast the record values to the associated type.
@@ -107,16 +101,65 @@ class CsvParser(FileTypeParser):
         cast it to a string. Downstream, the user's validation policy will determine whether the
         record should be emitted.
         """
-
-
+        cast_fn = CsvParser._get_cast_function(schema, config_format, logger)
+        for i, row in enumerate(reader):
+            if i < config_format.skip_rows_after_header:
+                continue
+            # The row was not properly parsed if any of the values are None
+            if any(val is None for val in row.values()):
+                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
+            else:
+                yield CsvParser._to_nullable(cast_fn(row), config_format.null_values)

-
+    @staticmethod
+    def _get_cast_function(
+        schema: Optional[Mapping[str, Any]], config_format: CsvFormat, logger: logging.Logger
+    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
+        # Only cast values if the schema is provided
+        if schema:
             property_types = {col: prop["type"] for col, prop in schema["properties"].items()}
-
-
+            return partial(_cast_types, property_types=property_types, config_format=config_format, logger=logger)
+        else:
+            # If no schema is provided, yield the rows as they are
+            return _no_cast
+
+    @staticmethod
+    def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Optional[str]]:
+        nullable = row | {k: None if v in null_values else v for k, v in row.items()}
+        return nullable
+
+    @staticmethod
+    def _skip_rows_before_header(fp: IOBase, rows_to_skip: int) -> None:
+        """
+        Skip rows before the header. This has to be done on the file object itself, not the reader
+        """
+        for _ in range(rows_to_skip):
+            fp.readline()
+
+    def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]:
+        # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if config_format.autogenerate_column_names:
+            return self._auto_generate_headers(fp, config_format)
+        else:
+            # If we're not autogenerating column names, we need to skip the rows before the header
+            self._skip_rows_before_header(fp, config_format.skip_rows_before_header)
+            # Then read the header
+            reader = csv.DictReader(fp, dialect=dialect_name)  # type: ignore
+            return next(reader)  # type: ignore

+    def _auto_generate_headers(self, fp: IOBase, config_format: CsvFormat) -> List[str]:
+        """
+        Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
+        See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+        """
+        next_line = next(fp).strip()
+        number_of_columns = len(next_line.split(config_format.delimiter))  # type: ignore
+        # Reset the file pointer to the beginning of the file so that the first row is not skipped
+        fp.seek(0)
+        return [f"f{i}" for i in range(number_of_columns)]

-
+
+def _cast_types(row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger) -> Dict[str, Any]:
     """
     Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

@@ -142,7 +185,7 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg

     elif python_type == bool:
         try:
-            cast_value =
+            cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values)
         except ValueError:
             warnings.append(_format_warning(key, value, prop_type))

@@ -178,5 +221,17 @@ def cast_types(row: Dict[str, str], property_types: Dict[str, Any], logger: logg
     return result


+def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool:
+    if value in true_values:
+        return True
+    if value in false_values:
+        return False
+    raise ValueError(f"Value {value} is not a valid boolean value")
+
+
 def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str:
     return f"{key}: value={value},expected_type={expected_type}"
+
+
+def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]:
+    return row
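A plain-Python illustration of how the configured `true_values`, `false_values`, and `null_values` sets drive per-cell interpretation, mirroring the helpers added in this release (standalone functions, not the CDK code):

```python
from typing import Optional, Set

TRUE_VALUES: Set[str] = {"y", "yes", "t", "true", "on", "1"}
FALSE_VALUES: Set[str] = {"n", "no", "f", "false", "off", "0"}
NULL_VALUES: Set[str] = {"NA", ""}


def to_bool(value: str) -> bool:
    if value in TRUE_VALUES:
        return True
    if value in FALSE_VALUES:
        return False
    raise ValueError(f"{value!r} is not a valid boolean value")


def to_nullable(value: str) -> Optional[str]:
    return None if value in NULL_VALUES else value


print(to_bool("yes"), to_bool("0"))          # True False
print(to_nullable("NA"), to_nullable("42"))  # None 42
```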
airbyte_cdk/sources/file_based/stream/cursor/__init__.py
CHANGED
@@ -1,4 +1,4 @@
+from .abstract_file_based_cursor import AbstractFileBasedCursor
 from .default_file_based_cursor import DefaultFileBasedCursor
-from .file_based_cursor import FileBasedCursor

-__all__ = ["
+__all__ = ["AbstractFileBasedCursor", "DefaultFileBasedCursor"]
airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py}
RENAMED
@@ -7,15 +7,23 @@ from abc import ABC, abstractmethod
 from datetime import datetime
 from typing import Any, Iterable, MutableMapping

+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.types import StreamState


-class FileBasedCursor(ABC):
+class AbstractFileBasedCursor(ABC):
     """
     Abstract base class for cursors used by file-based streams.
     """

+    @abstractmethod
+    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
+        """
+        Common interface for all cursors.
+        """
+        ...
+
     @abstractmethod
     def add_file(self, file: RemoteFile) -> None:
         """
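Declaring `__init__` as an `@abstractmethod` documents the constructor signature every concrete cursor must accept, which is what lets a source instantiate any injected cursor class uniformly as `cursor_cls(stream_config)`. A sketch of the idea with illustrative names (this is not the CDK interface, which has further abstract methods not shown in this diff):

```python
from abc import ABC, abstractmethod
from typing import Any, Dict


class AbstractCursor(ABC):
    @abstractmethod
    def __init__(self, stream_config: Dict[str, Any], **kwargs: Any):
        ...


class HistoryCursor(AbstractCursor):
    def __init__(self, stream_config: Dict[str, Any], **kwargs: Any):
        self.stream_config = stream_config
        self.history: Dict[str, str] = {}


cursor = HistoryCursor({"name": "users"})  # every subclass is constructed the same way
print(cursor.stream_config["name"])        # users
```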
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py
CHANGED
@@ -4,26 +4,26 @@

 import logging
 from datetime import datetime, timedelta
-from typing import Iterable, MutableMapping, Optional
+from typing import Any, Iterable, MutableMapping, Optional

+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.cursor.file_based_cursor import FileBasedCursor
+from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamState


-class DefaultFileBasedCursor(FileBasedCursor):
+class DefaultFileBasedCursor(AbstractFileBasedCursor):
     DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+    DEFAULT_MAX_HISTORY_SIZE = 10_000
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

-    def __init__(self,
+    def __init__(self, stream_config: FileBasedStreamConfig, **_: Any):
+        super().__init__(stream_config)
         self._file_to_datetime_history: MutableMapping[str, str] = {}
-        self._max_history_size = max_history_size
         self._time_window_if_history_is_full = timedelta(
-            days=days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+            days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
         )

-        if self._max_history_size <= 0:
-            raise ValueError(f"max_history_size must be a positive integer, got {self._max_history_size}")
         if self._time_window_if_history_is_full <= timedelta():
             raise ValueError(f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}")

@@ -37,7 +37,7 @@ class DefaultFileBasedCursor(FileBasedCursor):

     def add_file(self, file: RemoteFile) -> None:
         self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
-        if len(self._file_to_datetime_history) > self.
+        if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
             # Get the earliest file based on its last modified date and its uri
             oldest_file = self._compute_earliest_file_in_history()
             if oldest_file:
@@ -67,7 +67,7 @@ class DefaultFileBasedCursor(FileBasedCursor):
         """
         Returns true if the state's history is full, meaning new entries will start to replace old entries.
         """
-        return len(self._file_to_datetime_history) >= self.
+        return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE

     def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
         if file.uri in self._file_to_datetime_history:
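A toy illustration of the bounded-history behavior described above, where the oldest entry is evicted once the history exceeds its maximum size (not the CDK class; `MAX_HISTORY_SIZE` is shrunk here so the eviction is visible, the real default is 10_000):

```python
from datetime import datetime, timedelta
from typing import Dict

MAX_HISTORY_SIZE = 3

history: Dict[str, str] = {}


def add_file(uri: str, last_modified: datetime) -> None:
    history[uri] = last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    if len(history) > MAX_HISTORY_SIZE:
        # Evict the earliest file, ordered by (last-modified timestamp, uri).
        oldest = min(history.items(), key=lambda kv: (kv[1], kv[0]))[0]
        del history[oldest]


base = datetime(2023, 7, 1)
for i in range(5):
    add_file(f"s3://bucket/file_{i}.csv", base + timedelta(days=i))
print(sorted(history))  # only the 3 most recent files remain
```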
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py
CHANGED
@@ -15,13 +15,14 @@ from airbyte_cdk.sources.file_based.exceptions import (
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
+    RecordParseError,
     SchemaInferenceError,
     StopSyncPerValidationPolicy,
 )
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import merge_schemas, schemaless_schema
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
-from airbyte_cdk.sources.file_based.stream.cursor import
+from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamSlice
 from airbyte_cdk.sources.streams import IncrementalMixin
 from airbyte_cdk.sources.streams.core import JsonSchema
@@ -39,7 +40,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     ab_file_name_col = "_ab_source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]

-    def __init__(self, cursor:
+    def __init__(self, cursor: AbstractFileBasedCursor, **kwargs: Any):
         super().__init__(**kwargs)
         self._cursor = cursor

@@ -105,6 +106,18 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
                     )
                     break

+                except RecordParseError:
+                    # Increment line_no because the exception was raised before we could increment it
+                    line_no += 1
+                    yield AirbyteMessage(
+                        type=MessageType.LOG,
+                        log=AirbyteLogMessage(
+                            level=Level.ERROR,
+                            message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
+                            stack_trace=traceback.format_exc(),
+                        ),
+                    )
+
                 except Exception:
                     yield AirbyteMessage(
                         type=MessageType.LOG,