airbyte-cdk 6.45.0__py3-none-any.whl → 6.45.0.dev4101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +6 -45
- airbyte_cdk/connector_builder/main.py +2 -5
- airbyte_cdk/models/__init__.py +1 -0
- airbyte_cdk/models/airbyte_protocol.py +1 -3
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
- airbyte_cdk/sources/declarative/async_job/job.py +0 -6
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +6 -22
- airbyte_cdk/sources/declarative/checks/__init__.py +2 -5
- airbyte_cdk/sources/declarative/checks/check_stream.py +11 -113
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +8 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +50 -210
- airbyte_cdk/sources/declarative/extractors/record_selector.py +6 -1
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +1 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +4 -8
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +2 -23
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +43 -142
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +4 -16
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +50 -263
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +1 -5
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +2 -25
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +30 -101
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +9 -4
- airbyte_cdk/sources/declarative/transformations/add_fields.py +1 -3
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +32 -14
- airbyte_cdk/sources/file_based/file_record_data.py +24 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +11 -1
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -1
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +16 -31
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +1 -3
- airbyte_cdk/sources/streams/concurrent/default_stream.py +3 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +0 -4
- airbyte_cdk/sources/types.py +11 -2
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +8 -8
- {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/RECORD +44 -50
- airbyte_cdk/models/file_transfer_record_message.py +0 -13
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -150
- airbyte_cdk/sources/declarative/requesters/query_properties/__init__.py +0 -13
- airbyte_cdk/sources/declarative/requesters/query_properties/properties_from_endpoint.py +0 -40
- airbyte_cdk/sources/declarative/requesters/query_properties/property_chunking.py +0 -69
- airbyte_cdk/sources/declarative/requesters/query_properties/query_properties.py +0 -58
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/__init__.py +0 -10
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/group_by_key.py +0 -33
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/merge_strategy.py +0 -19
- {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/retrievers/file_uploader.py (new file)
@@ -0,0 +1,89 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+import json
+import logging
+import uuid
+from dataclasses import InitVar, dataclass, field
+from pathlib import Path
+from typing import Any, Mapping, Optional, Union
+
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
+from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
+from airbyte_cdk.sources.declarative.interpolation.interpolated_string import (
+    InterpolatedString,
+)
+from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
+    SafeResponse,
+)
+from airbyte_cdk.sources.declarative.requesters import Requester
+from airbyte_cdk.sources.declarative.types import Record, StreamSlice
+from airbyte_cdk.sources.types import Config
+from airbyte_cdk.sources.utils.files_directory import get_files_directory
+
+logger = logging.getLogger("airbyte")
+
+
+@dataclass
+class FileUploader:
+    requester: Requester
+    download_target_extractor: RecordExtractor
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    filename_extractor: Optional[Union[InterpolatedString, str]] = None
+    content_extractor: Optional[RecordExtractor] = None
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        if self.filename_extractor:
+            self.filename_extractor = InterpolatedString.create(
+                self.filename_extractor,
+                parameters=parameters,
+            )
+
+    def upload(self, record: Record) -> None:
+        mocked_response = SafeResponse()
+        mocked_response.content = json.dumps(record.data).encode()
+        download_target = list(self.download_target_extractor.extract_records(mocked_response))[0]
+        if not isinstance(download_target, str):
+            raise ValueError(
+                f"download_target is expected to be a str but was {type(download_target)}: {download_target}"
+            )
+
+        response = self.requester.send_request(
+            stream_slice=StreamSlice(
+                partition={}, cursor_slice={}, extra_fields={"download_target": download_target}
+            ),
+        )
+
+        if self.content_extractor:
+            raise NotImplementedError("TODO")
+        else:
+            files_directory = Path(get_files_directory())
+
+            file_name = (
+                self.filename_extractor.eval(self.config, record=record)
+                if self.filename_extractor
+                else str(uuid.uuid4())
+            )
+            file_name = file_name.lstrip("/")
+            file_relative_path = Path(record.stream_name) / Path(file_name)
+
+            full_path = files_directory / file_relative_path
+            full_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(str(full_path), "wb") as f:
+                f.write(response.content)
+            file_size_bytes = full_path.stat().st_size
+
+            logger.info("File uploaded successfully")
+            logger.info(f"File url: {str(full_path)}")
+            logger.info(f"File size: {file_size_bytes / 1024} KB")
+            logger.info(f"File relative path: {str(file_relative_path)}")
+
+            record.file_reference = AirbyteRecordMessageFileReference(
+                file_url=str(full_path),
+                file_relative_path=str(file_relative_path),
+                file_size_bytes=file_size_bytes,
+            )
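As a quick illustration of the path logic in `upload()` above, here is a standalone re-implementation (not the CDK class; the function name and signature are hypothetical) showing where an uploaded file lands and the uuid4 fallback used when no `filename_extractor` is configured:

```python
# Illustrative sketch only; mirrors FileUploader.upload()'s path handling.
import uuid
from pathlib import Path
from typing import Optional

def plan_upload_path(files_directory: str, stream_name: str, extracted_name: Optional[str]) -> Path:
    # Fall back to a random name, strip leading slashes, and nest the file
    # under the stream's own directory.
    file_name = (extracted_name or str(uuid.uuid4())).lstrip("/")
    full_path = Path(files_directory) / stream_name / file_name
    full_path.parent.mkdir(parents=True, exist_ok=True)
    return full_path

print(plan_upload_path("/tmp/airbyte-files", "documents", "/reports/q1.pdf"))
# /tmp/airbyte-files/documents/reports/q1.pdf
print(plan_upload_path("/tmp/airbyte-files", "documents", None))  # uuid4-based name
```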
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
@@ -1,9 +1,8 @@
 #
-# Copyright (c)
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
 import json
-from collections import defaultdict
 from dataclasses import InitVar, dataclass, field
 from functools import partial
 from itertools import islice
@@ -13,7 +12,6 @@ from typing import (
     Iterable,
     List,
     Mapping,
-    MutableMapping,
     Optional,
     Set,
     Tuple,
@@ -33,7 +31,6 @@ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import (
 )
 from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination
 from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator
-from airbyte_cdk.sources.declarative.requesters.query_properties import QueryProperties
 from airbyte_cdk.sources.declarative.requesters.request_options import (
     DefaultRequestOptionsProvider,
     RequestOptionsProvider,
@@ -91,7 +88,6 @@ class SimpleRetriever(Retriever):
     )
     cursor: Optional[DeclarativeCursor] = None
     ignore_stream_slicer_parameters_on_paginated_requests: bool = False
-    additional_query_properties: Optional[QueryProperties] = None
 
     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._paginator = self.paginator or NoPagination(parameters=parameters)
@@ -449,110 +445,43 @@
         :param stream_slice: The stream slice to read data for
         :return: The records read from the API source
         """
+        _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
 
-        property_chunks: List[List[str]] = (
-            list(
-                self.additional_query_properties.get_request_property_chunks(
-                    stream_slice=stream_slice
-                )
-            )
-            if self.additional_query_properties
-            else []
+        most_recent_record_from_slice = None
+        record_generator = partial(
+            self._parse_records,
+            stream_slice=stream_slice,
+            stream_state=self.state or {},
+            records_schema=records_schema,
         )
-        records_without_merge_key = []
-        merged_records: MutableMapping[str, Any] = defaultdict(dict)
 
-        _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
-        most_recent_record_from_slice = None
+        if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
+            stream_state = self.state
 
-        if self.additional_query_properties:
-            for properties in property_chunks:
-                _slice = StreamSlice(
-                    partition=_slice.partition or {},
-                    cursor_slice=_slice.cursor_slice or {},
-                    extra_fields={"query_properties": properties},
-                )  # None-check
-
-                record_generator = partial(
-                    self._parse_records,
-                    stream_slice=_slice,
-                    stream_state=self.state or {},
-                    records_schema=records_schema,
+            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
+            # fetch more records. The platform deletes stream state for full refresh streams before starting a
+            # new job, so we don't need to worry about this value existing for the initial attempt
+            if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
+                return
+
+            yield from self._read_single_page(record_generator, stream_state, _slice)
+        else:
+            for stream_data in self._read_pages(record_generator, self.state, _slice):
+                current_record = self._extract_record(stream_data, _slice)
+                if self.cursor and current_record:
+                    self.cursor.observe(_slice, current_record)
+
+                # Latest record read, not necessarily within slice boundaries.
+                # TODO Remove once all custom components implement `observe` method.
+                # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
+                most_recent_record_from_slice = self._get_most_recent_record(
+                    most_recent_record_from_slice, current_record, _slice
                 )
+                yield stream_data
 
-                for stream_data in self._read_pages(record_generator, self.state, _slice):
-                    current_record = self._extract_record(stream_data, _slice)
-                    if self.cursor and current_record:
-                        self.cursor.observe(_slice, current_record)
-
-                    # Latest record read, not necessarily within slice boundaries.
-                    # TODO Remove once all custom components implement `observe` method.
-                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
-                    most_recent_record_from_slice = self._get_most_recent_record(
-                        most_recent_record_from_slice, current_record, _slice
-                    )
-
-                    if current_record and self.additional_query_properties.property_chunking:
-                        merge_key = (
-                            self.additional_query_properties.property_chunking.get_merge_key(
-                                current_record
-                            )
-                        )
-                        if merge_key:
-                            merged_records[merge_key].update(current_record)
-                        else:
-                            # We should still emit records even if the record did not have a merge key
-                            records_without_merge_key.append(current_record)
-                    else:
-                        yield stream_data
         if self.cursor:
             self.cursor.close_slice(_slice, most_recent_record_from_slice)
-
-            if len(merged_records) > 0:
-                yield from [
-                    Record(data=merged_record, stream_name=self.name, associated_slice=stream_slice)
-                    for merged_record in merged_records.values()
-                ]
-            if len(records_without_merge_key) > 0:
-                yield from records_without_merge_key
-        else:
-            _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
-
-            most_recent_record_from_slice = None
-            record_generator = partial(
-                self._parse_records,
-                stream_slice=stream_slice,
-                stream_state=self.state or {},
-                records_schema=records_schema,
-            )
-
-            if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
-                stream_state = self.state
-
-                # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-                # fetch more records. The platform deletes stream state for full refresh streams before starting a
-                # new job, so we don't need to worry about this value existing for the initial attempt
-                if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
-                    return
-
-                yield from self._read_single_page(record_generator, stream_state, _slice)
-            else:
-                for stream_data in self._read_pages(record_generator, self.state, _slice):
-                    current_record = self._extract_record(stream_data, _slice)
-                    if self.cursor and current_record:
-                        self.cursor.observe(_slice, current_record)
-
-                    # Latest record read, not necessarily within slice boundaries.
-                    # TODO Remove once all custom components implement `observe` method.
-                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
-                    most_recent_record_from_slice = self._get_most_recent_record(
-                        most_recent_record_from_slice, current_record, _slice
-                    )
-                    yield stream_data
-
-            if self.cursor:
-                self.cursor.close_slice(_slice, most_recent_record_from_slice)
-            return
+        return
 
     def _get_most_recent_record(
         self,
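The control flow that remains after this removal is easier to read outside the diff. Below is a self-contained sketch of the two branches `read_records` now takes; plain dicts stand in for the cursor and state, and the state-key name is an assumption modeled on the CDK's resumable-full-refresh convention:

```python
# Illustrative only; not the CDK implementation.
FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete"  # assumed key name

def read_records(state, is_resumable_full_refresh, pages):
    if is_resumable_full_refresh:
        # A prior successful attempt leaves a completion marker in the state,
        # so there is nothing left to fetch on this attempt.
        if state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
            return
        yield from pages[0]  # RFR reads a single page per invocation
    else:
        for page in pages:  # the regular path pages through everything
            yield from page

print(list(read_records({FULL_REFRESH_SYNC_COMPLETE_KEY: True}, True, [[{"id": 1}]])))  # []
print(list(read_records({}, False, [[{"id": 1}], [{"id": 2}]])))  # [{'id': 1}, {'id': 2}]
```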
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py
@@ -58,11 +58,16 @@ class DeclarativePartition(Partition):
     def read(self) -> Iterable[Record]:
         for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
             if isinstance(stream_data, Mapping):
-                yield Record(
-                    data=stream_data,
-                    stream_name=self.stream_name(),
-                    associated_slice=self._stream_slice,
+                record = (
+                    stream_data
+                    if isinstance(stream_data, Record)
+                    else Record(
+                        data=stream_data,
+                        stream_name=self.stream_name(),
+                        associated_slice=self._stream_slice,
+                    )
                 )
+                yield record
             else:
                 self._message_repository.emit_message(stream_data)
 
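The effect of the new wrapper is that data arriving as a `Record` (for example, one already carrying a file reference) is passed through untouched instead of being re-wrapped. A self-contained sketch with a stand-in `Record` type:

```python
# Stand-in dataclass for airbyte_cdk's Record; illustration only.
from dataclasses import dataclass
from typing import Any, Mapping, Optional

@dataclass
class Record:
    data: Mapping[str, Any]
    stream_name: str
    associated_slice: Optional[Any] = None

def normalize(stream_data, stream_name, stream_slice):
    # Same shape as the conditional added in read() above.
    return (
        stream_data
        if isinstance(stream_data, Record)
        else Record(data=stream_data, stream_name=stream_name, associated_slice=stream_slice)
    )

already = Record(data={"id": 1}, stream_name="files")
assert normalize(already, "files", None) is already  # passed through untouched
assert normalize({"id": 2}, "files", None).stream_name == "files"  # mappings get wrapped
```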
airbyte_cdk/sources/declarative/transformations/add_fields.py
@@ -139,9 +139,7 @@ class AddFields(RecordTransformation):
             valid_types = (parsed_field.value_type,) if parsed_field.value_type else None
             value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs)
             is_empty_condition = not self.condition
-            if is_empty_condition or self._filter_interpolator.eval(
-                config, value=value, path=parsed_field.path, **kwargs
-            ):
+            if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs):
                 dpath.new(record, parsed_field.path, value)
 
     def __eq__(self, other: Any) -> bool:
airbyte_cdk/sources/file_based/file_based_stream_reader.py
@@ -8,16 +8,18 @@ from datetime import datetime
 from enum import Enum
 from io import IOBase
 from os import makedirs, path
-from typing import Any,
+from typing import Any, Iterable, List, Optional, Set, Tuple, MutableMapping
 
 from wcmatch.glob import GLOBSTAR, globmatch
 
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
 from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
     include_identities_stream,
     preserve_directory_structure,
     use_file_transfer,
 )
+from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 
 
@@ -28,6 +30,12 @@ class FileReadMode(Enum):
 
 class AbstractFileBasedStreamReader(ABC):
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
+    FILE_RELATIVE_PATH = "file_relative_path"
+    FILE_NAME = "file_name"
+    LOCAL_FILE_PATH = "local_file_path"
+    ABSOLUTE_FILE_PATH = "absolute_file_path"
+    SOURCE_FILE_URI = "source_file_relative_path"
+    FILE_FOLDER = "file_folder"
 
     def __init__(self) -> None:
         self._config = None
@@ -148,9 +156,9 @@ class AbstractFileBasedStreamReader(ABC):
         return False
 
     @abstractmethod
-    def get_file(
+    def upload(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
-    ) -> Dict[str, Any]:
+    ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
         """
         This is required for connectors that will support writing to
         files. It will handle the logic to download,get,read,acquire or
@@ -162,25 +170,35 @@ class AbstractFileBasedStreamReader(ABC):
         logger (logging.Logger): Logger for logging information and errors.
 
         Returns:
-            dict: A dictionary containing the following:
-                - "file_url" (str): The absolute path of the downloaded file.
-                - "bytes" (int): The file size in bytes.
-                - "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
-                this a mounted volume in the pod container.
-
+            AirbyteRecordMessageFileReference: A file reference object containing:
+                - staging_file_url (str): The absolute path to the referenced file in the staging area.
+                - file_size_bytes (int): The size of the referenced file in bytes.
+                - source_file_relative_path (str): The relative path to the referenced file in source.
         """
         ...
 
-    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
+    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> MutableMapping[str, Any]:
         preserve_directory_structure = self.preserve_directory_structure()
+        file_uri = file.uri
+        file_name = path.basename(file_uri)
+        file_folder = path.dirname(file_uri)
         if preserve_directory_structure:
             # Remove left slashes from source path format to make relative path for writing locally
-            file_relative_path = file.uri.lstrip("/")
+            file_relative_path = file_uri.lstrip("/")
         else:
-            file_relative_path = path.basename(file.uri)
+            file_relative_path = file_name
         local_file_path = path.join(local_directory, file_relative_path)
-
         # Ensure the local directory exists
         makedirs(path.dirname(local_file_path), exist_ok=True)
         absolute_file_path = path.abspath(local_file_path)
-        return [file_relative_path, local_file_path, absolute_file_path]
+
+        file_paths = {
+            self.FILE_RELATIVE_PATH: file_relative_path,
+            self.LOCAL_FILE_PATH: local_file_path,
+            self.ABSOLUTE_FILE_PATH: absolute_file_path,
+            self.FILE_NAME: file_name,
+            self.FILE_FOLDER: file_folder,
+            self.SOURCE_FILE_URI: file_uri,
+        }
+        return file_paths
airbyte_cdk/sources/file_based/file_record_data.py (new file)
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic.v1 import BaseModel
+
+
+class FileRecordData(BaseModel):
+    """
+    A record in a file-based stream.
+    """
+
+    folder: str
+    filename: str
+    bytes: int
+
+    id: Optional[str] = None
+    created_at: Optional[int] = None
+    updated_at: Optional[int] = None
+    mime_type: Optional[str] = None
+    description: Optional[str] = None
airbyte_cdk/sources/file_based/file_types/file_transfer.py
@@ -2,34 +2,27 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 #
 import logging
-import os
-from typing import Any, Dict, Iterable
+from typing import Iterable, Tuple
 
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-
-AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
-DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
+from airbyte_cdk.sources.utils.files_directory import get_files_directory
 
 
 class FileTransfer:
     def __init__(self) -> None:
-        self._local_directory = (
-            AIRBYTE_STAGING_DIRECTORY
-            if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
-            else DEFAULT_LOCAL_DIRECTORY
-        )
+        self._local_directory = get_files_directory()
 
-    def get_file(
+    def upload(
         self,
-        config: FileBasedStreamConfig,
         file: RemoteFile,
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
-    ) -> Iterable[Dict[str, Any]]:
+    ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
        try:
-            yield stream_reader.get_file(
+            yield stream_reader.upload(
                file=file, local_directory=self._local_directory, logger=logger
            )
        except Exception as ex:
airbyte_cdk/sources/file_based/schema_helpers.py
@@ -18,9 +18,19 @@ JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
 SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]
 
 schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
+
 file_transfer_schema = {
     "type": "object",
-    "properties": {
+    "properties": {
+        "folder": {"type": "string"},
+        "file_name": {"type": "string"},
+        "bytes": {"type": "integer"},
+        "id": {"type": ["null", "string"]},
+        "created_at": {"type": ["null", "integer"]},
+        "updated_at": {"type": ["null", "integer"]},
+        "mime_type": {"type": ["null", "string"]},
+        "description": {"type": ["null", "string"]},
+    },
 }
 
 
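The widened `file_transfer_schema` accepts exactly the `FileRecordData` payload shown earlier; a quick check with the third-party `jsonschema` package (an assumption — it is not a dependency introduced by this diff):

```python
from jsonschema import validate  # assumes jsonschema is installed

file_transfer_schema = {
    "type": "object",
    "properties": {
        "folder": {"type": "string"},
        "file_name": {"type": "string"},
        "bytes": {"type": "integer"},
        "id": {"type": ["null", "string"]},
        "created_at": {"type": ["null", "integer"]},
        "updated_at": {"type": ["null", "integer"]},
        "mime_type": {"type": ["null", "string"]},
        "description": {"type": ["null", "string"]},
    },
}

# Raises jsonschema.ValidationError on mismatch; passes silently here.
validate(
    instance={"folder": "reports/2025", "file_name": "q1.csv", "bytes": 2048},
    schema=file_transfer_schema,
)
```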
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py
@@ -11,7 +11,7 @@ from functools import cache
 from os import path
 from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
@@ -97,14 +97,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         self, configured_catalog_json_schema: Dict[str, Any]
     ) -> Dict[str, Any]:
         if self.use_file_transfer:
-            return {
-                "type": "object",
-                "properties": {
-                    "file_path": {"type": "string"},
-                    "file_size": {"type": "string"},
-                    self.ab_file_name_col: {"type": "string"},
-                },
-            }
+            return file_transfer_schema
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
 
@@ -145,14 +138,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         record[self.ab_file_name_col] = file.uri
         return record
 
-    def transform_record_for_file_transfer(
-        self, record: dict[str, Any], file: RemoteFile
-    ) -> dict[str, Any]:
-        # timstamp() returns a float representing the number of seconds since the unix epoch
-        record[self.modified] = int(file.last_modified.timestamp()) * 1000
-        record[self.source_file_url] = file.uri
-        return record
-
     def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
         """
         Yield all records from all remote files in `list_files_for_this_sync`.
@@ -166,6 +151,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name)
         # The stream only supports a single file type, so we can use the same parser for all files
         parser = self.get_parser()
+        file_transfer = FileTransfer()
         for file in stream_slice["files"]:
             # only serialize the datetime once
             file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT)
@@ -173,19 +159,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
 
             try:
                 if self.use_file_transfer:
-
-
-                    file_transfer = FileTransfer()
-                    for record in file_transfer.get_file(
-                        self.config, file, self.stream_reader, self.logger
+                    for file_record_data, file_reference in file_transfer.upload(
+                        file=file, stream_reader=self.stream_reader, logger=self.logger
                     ):
-                        line_no += 1
-                        if not self.record_passes_validation_policy(record):
-                            n_skipped += 1
-                            continue
-                        record = self.transform_record_for_file_transfer(record, file)
                         yield stream_data_to_airbyte_message(
-                            self.name,
+                            self.name,
+                            file_record_data.dict(exclude_none=True),
+                            file_reference=file_reference,
                         )
                 else:
                     for record in parser.parse_records(
@@ -259,6 +239,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
 
     @cache
     def get_json_schema(self) -> JsonSchema:
+        if self.use_file_transfer:
+            return file_transfer_schema
         extra_fields = {
             self.ab_last_mod_col: {"type": "string"},
             self.ab_file_name_col: {"type": "string"},
@@ -282,9 +264,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}
 
     def _get_raw_json_schema(self) -> JsonSchema:
-        if self.use_file_transfer:
-            return file_transfer_schema
-        elif self.config.input_schema:
+        if self.config.input_schema:
             return self.config.get_input_schema()  # type: ignore
         elif self.config.schemaless:
             return schemaless_schema
@@ -341,6 +321,11 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             self.config.globs or [], self.config.legacy_prefix, self.logger
         )
 
+    def as_airbyte_stream(self) -> AirbyteStream:
+        file_stream = super().as_airbyte_stream()
+        file_stream.is_file_based = self.use_file_transfer
+        return file_stream
+
     def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
         loop = asyncio.get_event_loop()
         schema = loop.run_until_complete(self._infer_schema(files))
airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py
@@ -61,9 +61,7 @@ class PermissionsFileBasedStream(DefaultFileBasedStream):
                     permissions_record = self.transform_record(
                         permissions_record, file, file_datetime_string
                     )
-                    yield stream_data_to_airbyte_message(
-                        self.name, permissions_record, is_file_transfer_message=False
-                    )
+                    yield stream_data_to_airbyte_message(self.name, permissions_record)
             except Exception as e:
                 self.logger.error(f"Failed to retrieve permissions for file {file.uri}: {str(e)}")
                 yield AirbyteMessage(
airbyte_cdk/sources/streams/concurrent/default_stream.py
@@ -29,6 +29,7 @@ class DefaultStream(AbstractStream):
         logger: Logger,
         cursor: Cursor,
         namespace: Optional[str] = None,
+        supports_file_transfer: bool = False,
     ) -> None:
         self._stream_partition_generator = partition_generator
         self._name = name
@@ -39,6 +40,7 @@ class DefaultStream(AbstractStream):
         self._logger = logger
         self._cursor = cursor
         self._namespace = namespace
+        self._supports_file_transfer = supports_file_transfer
 
     def generate_partitions(self) -> Iterable[Partition]:
         yield from self._stream_partition_generator.generate()
@@ -68,6 +70,7 @@ class DefaultStream(AbstractStream):
             json_schema=dict(self._json_schema),
             supported_sync_modes=[SyncMode.full_refresh],
             is_resumable=False,
+            is_file_based=self._supports_file_transfer,
         )
 
         if self._namespace:
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py
@@ -71,10 +71,6 @@ class AbstractStreamStateConverter(ABC):
         for stream_slice in state.get("slices", []):
             stream_slice[self.START_KEY] = self._from_state_message(stream_slice[self.START_KEY])
             stream_slice[self.END_KEY] = self._from_state_message(stream_slice[self.END_KEY])
-            if self.MOST_RECENT_RECORD_KEY in stream_slice:
-                stream_slice[self.MOST_RECENT_RECORD_KEY] = self._from_state_message(
-                    stream_slice[self.MOST_RECENT_RECORD_KEY]
-                )
         return state
 
     def serialize(