airbyte-cdk 6.7.1rc4__py3-none-any.whl → 6.7.2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/cli/source_declarative_manifest/_run.py +2 -1
- airbyte_cdk/config_observation.py +2 -1
- airbyte_cdk/connector.py +1 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +1 -1
- airbyte_cdk/connector_builder/main.py +2 -1
- airbyte_cdk/destinations/destination.py +2 -1
- airbyte_cdk/destinations/vector_db_based/config.py +2 -1
- airbyte_cdk/destinations/vector_db_based/document_processor.py +4 -3
- airbyte_cdk/destinations/vector_db_based/embedder.py +5 -4
- airbyte_cdk/entrypoint.py +3 -2
- airbyte_cdk/logger.py +2 -1
- airbyte_cdk/models/__init__.py +2 -0
- airbyte_cdk/models/airbyte_protocol.py +2 -1
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +3 -3
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +1 -1
- airbyte_cdk/sources/config.py +2 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +1 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +1 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +1 -0
- airbyte_cdk/sources/declarative/auth/token.py +2 -1
- airbyte_cdk/sources/declarative/auth/token_provider.py +3 -2
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +66 -8
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +196 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -2
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +1 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +1 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +1 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +1 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +1 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +6 -48
- airbyte_cdk/sources/declarative/extractors/record_selector.py +32 -4
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +7 -2
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +2 -1
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +5 -2
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +5 -2
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +1 -3
- airbyte_cdk/sources/declarative/interpolation/jinja.py +5 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +4 -3
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +1 -1
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +144 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +45 -4
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +1 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +1 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +3 -2
- airbyte_cdk/sources/declarative/requesters/http_requester.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +1 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +9 -3
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +2 -1
- airbyte_cdk/sources/declarative/requesters/requester.py +1 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +2 -1
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +12 -7
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +7 -4
- airbyte_cdk/sources/declarative/transformations/add_fields.py +1 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +1 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +1 -0
- airbyte_cdk/sources/embedded/tools.py +1 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
- airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
- airbyte_cdk/sources/file_based/config/csv_format.py +2 -1
- airbyte_cdk/sources/file_based/config/excel_format.py +2 -1
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +2 -1
- airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
- airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
- airbyte_cdk/sources/file_based/config/unstructured_format.py +2 -1
- airbyte_cdk/sources/file_based/file_based_source.py +2 -1
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +2 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +1 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +2 -1
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +5 -5
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +2 -1
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +2 -1
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +9 -8
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +2 -1
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +5 -4
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +1 -1
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -1
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +1 -1
- airbyte_cdk/sources/http_logger.py +1 -0
- airbyte_cdk/sources/streams/call_rate.py +1 -2
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +2 -1
- airbyte_cdk/sources/streams/concurrent/adapters.py +8 -4
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +2 -1
- airbyte_cdk/sources/streams/concurrent/cursor.py +52 -9
- airbyte_cdk/sources/streams/concurrent/default_stream.py +1 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -1
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +1 -1
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +1 -1
- airbyte_cdk/sources/streams/core.py +2 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +2 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +1 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +1 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +2 -1
- airbyte_cdk/sources/streams/http/http.py +3 -2
- airbyte_cdk/sources/streams/http/http_client.py +58 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +1 -0
- airbyte_cdk/sources/types.py +14 -1
- airbyte_cdk/sources/utils/schema_helpers.py +3 -2
- airbyte_cdk/sql/secrets.py +2 -1
- airbyte_cdk/sql/shared/sql_processor.py +8 -6
- airbyte_cdk/test/entrypoint_wrapper.py +4 -3
- airbyte_cdk/test/mock_http/mocker.py +1 -0
- airbyte_cdk/utils/schema_inferrer.py +2 -1
- airbyte_cdk/utils/slice_hasher.py +1 -1
- airbyte_cdk/utils/traced_exception.py +2 -1
- {airbyte_cdk-6.7.1rc4.dist-info → airbyte_cdk-6.7.2.dev0.dist-info}/METADATA +9 -2
- {airbyte_cdk-6.7.1rc4.dist-info → airbyte_cdk-6.7.2.dev0.dist-info}/RECORD +123 -124
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -35
- {airbyte_cdk-6.7.1rc4.dist-info → airbyte_cdk-6.7.2.dev0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.7.1rc4.dist-info → airbyte_cdk-6.7.2.dev0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.7.1rc4.dist-info → airbyte_cdk-6.7.2.dev0.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,13 @@
|
|
1
1
|
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
2
2
|
|
3
|
-
from typing import
|
3
|
+
from typing import Any, Callable, Iterable, Mapping, Optional
|
4
4
|
|
5
5
|
from airbyte_cdk.sources.declarative.retrievers import Retriever
|
6
6
|
from airbyte_cdk.sources.message import MessageRepository
|
7
7
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
8
8
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
9
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
10
9
|
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
|
11
|
-
from airbyte_cdk.sources.types import StreamSlice
|
10
|
+
from airbyte_cdk.sources.types import Record, StreamSlice
|
12
11
|
from airbyte_cdk.utils.slice_hasher import SliceHasher
|
13
12
|
|
14
13
|
|
@@ -59,7 +58,11 @@ class DeclarativePartition(Partition):
|
|
59
58
|
def read(self) -> Iterable[Record]:
|
60
59
|
for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
|
61
60
|
if isinstance(stream_data, Mapping):
|
62
|
-
yield Record(
|
61
|
+
yield Record(
|
62
|
+
data=stream_data,
|
63
|
+
stream_name=self.stream_name(),
|
64
|
+
associated_slice=self._stream_slice,
|
65
|
+
)
|
63
66
|
else:
|
64
67
|
self._message_repository.emit_message(stream_data)
|
65
68
|
|
@@ -6,6 +6,7 @@ from dataclasses import InitVar, dataclass, field
|
|
6
6
|
from typing import Any, Dict, List, Mapping, Optional, Type, Union
|
7
7
|
|
8
8
|
import dpath
|
9
|
+
|
9
10
|
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
|
10
11
|
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
|
11
12
|
from airbyte_cdk.sources.types import Config, FieldPointer, StreamSlice, StreamState
|
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Mapping, Optional
|
|
7
7
|
|
8
8
|
import dpath
|
9
9
|
import dpath.exceptions
|
10
|
+
|
10
11
|
from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean
|
11
12
|
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
|
12
13
|
from airbyte_cdk.sources.types import Config, FieldPointer, StreamSlice, StreamState
|
@@ -6,6 +6,7 @@ import pkgutil
|
|
6
6
|
from typing import Any, List, Mapping, Optional
|
7
7
|
|
8
8
|
import yaml
|
9
|
+
|
9
10
|
from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog
|
10
11
|
from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
|
11
12
|
ConcurrentDeclarativeSource,
|
@@ -7,10 +7,11 @@ from abc import abstractmethod
|
|
7
7
|
from typing import Any, Dict, List, Literal, Optional, Union
|
8
8
|
|
9
9
|
import dpath
|
10
|
+
from pydantic.v1 import AnyUrl, BaseModel, Field
|
11
|
+
|
10
12
|
from airbyte_cdk import OneOfOptionConfig
|
11
13
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
12
14
|
from airbyte_cdk.sources.utils import schema_helpers
|
13
|
-
from pydantic.v1 import AnyUrl, BaseModel, Field
|
14
15
|
|
15
16
|
|
16
17
|
class DeliverRecords(BaseModel):
|
@@ -3,9 +3,10 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
|
6
|
-
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
7
6
|
from pydantic.v1 import BaseModel, Field
|
8
7
|
|
8
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
9
|
+
|
9
10
|
|
10
11
|
class AvroFormat(BaseModel):
|
11
12
|
class Config(OneOfOptionConfig):
|
@@ -6,10 +6,11 @@ import codecs
|
|
6
6
|
from enum import Enum
|
7
7
|
from typing import Any, Dict, List, Optional, Set, Union
|
8
8
|
|
9
|
-
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
10
9
|
from pydantic.v1 import BaseModel, Field, root_validator, validator
|
11
10
|
from pydantic.v1.error_wrappers import ValidationError
|
12
11
|
|
12
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
13
|
+
|
13
14
|
|
14
15
|
class InferenceType(Enum):
|
15
16
|
NONE = "None"
|
@@ -2,9 +2,10 @@
|
|
2
2
|
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
6
5
|
from pydantic.v1 import BaseModel, Field
|
7
6
|
|
7
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
8
|
+
|
8
9
|
|
9
10
|
class ExcelFormat(BaseModel):
|
10
11
|
class Config(OneOfOptionConfig):
|
@@ -5,6 +5,8 @@
|
|
5
5
|
from enum import Enum
|
6
6
|
from typing import Any, List, Mapping, Optional, Union
|
7
7
|
|
8
|
+
from pydantic.v1 import BaseModel, Field, validator
|
9
|
+
|
8
10
|
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
9
11
|
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
|
10
12
|
from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
|
@@ -13,7 +15,6 @@ from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
|
|
13
15
|
from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
|
14
16
|
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
|
15
17
|
from airbyte_cdk.sources.file_based.schema_helpers import type_mapping_to_jsonschema
|
16
|
-
from pydantic.v1 import BaseModel, Field, validator
|
17
18
|
|
18
19
|
PrimaryKeyType = Optional[Union[str, List[str]]]
|
19
20
|
|
@@ -2,9 +2,10 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
6
5
|
from pydantic.v1 import BaseModel, Field
|
7
6
|
|
7
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
8
|
+
|
8
9
|
|
9
10
|
class JsonlFormat(BaseModel):
|
10
11
|
class Config(OneOfOptionConfig):
|
@@ -3,9 +3,10 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
|
6
|
-
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
7
6
|
from pydantic.v1 import BaseModel, Field
|
8
7
|
|
8
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
9
|
+
|
9
10
|
|
10
11
|
class ParquetFormat(BaseModel):
|
11
12
|
class Config(OneOfOptionConfig):
|
@@ -4,9 +4,10 @@
|
|
4
4
|
|
5
5
|
from typing import List, Literal, Optional, Union
|
6
6
|
|
7
|
-
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
8
7
|
from pydantic.v1 import BaseModel, Field
|
9
8
|
|
9
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
10
|
+
|
10
11
|
|
11
12
|
class LocalProcessingConfigModel(BaseModel):
|
12
13
|
mode: Literal["local"] = Field("local", const=True)
|
@@ -8,6 +8,8 @@ from abc import ABC
|
|
8
8
|
from collections import Counter
|
9
9
|
from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union
|
10
10
|
|
11
|
+
from pydantic.v1.error_wrappers import ValidationError
|
12
|
+
|
11
13
|
from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
|
12
14
|
from airbyte_cdk.models import (
|
13
15
|
AirbyteMessage,
|
@@ -60,7 +62,6 @@ from airbyte_cdk.sources.streams import Stream
|
|
60
62
|
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
61
63
|
from airbyte_cdk.utils.analytics_message import create_analytics_message
|
62
64
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
63
|
-
from pydantic.v1.error_wrappers import ValidationError
|
64
65
|
|
65
66
|
DEFAULT_CONCURRENCY = 100
|
66
67
|
MAX_CONCURRENCY = 100
|
@@ -10,9 +10,10 @@ from io import IOBase
|
|
10
10
|
from os import makedirs, path
|
11
11
|
from typing import Any, Dict, Iterable, List, Optional, Set
|
12
12
|
|
13
|
+
from wcmatch.glob import GLOBSTAR, globmatch
|
14
|
+
|
13
15
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
14
16
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
15
|
-
from wcmatch.glob import GLOBSTAR, globmatch
|
16
17
|
|
17
18
|
|
18
19
|
class FileReadMode(Enum):
|
@@ -6,6 +6,7 @@ import logging
|
|
6
6
|
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
|
7
7
|
|
8
8
|
import fastavro
|
9
|
+
|
9
10
|
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
10
11
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
11
12
|
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
@@ -12,6 +12,8 @@ from io import IOBase
|
|
12
12
|
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set, Tuple
|
13
13
|
from uuid import uuid4
|
14
14
|
|
15
|
+
from orjson import orjson
|
16
|
+
|
15
17
|
from airbyte_cdk.models import FailureType
|
16
18
|
from airbyte_cdk.sources.file_based.config.csv_format import (
|
17
19
|
CsvFormat,
|
@@ -29,7 +31,6 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
|
|
29
31
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
30
32
|
from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
|
31
33
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
32
|
-
from orjson import orjson
|
33
34
|
|
34
35
|
DIALECT_NAME = "_config_dialect"
|
35
36
|
|
@@ -8,6 +8,11 @@ from pathlib import Path
|
|
8
8
|
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
|
9
9
|
|
10
10
|
import pandas as pd
|
11
|
+
from numpy import datetime64, issubdtype
|
12
|
+
from numpy import dtype as dtype_
|
13
|
+
from orjson import orjson
|
14
|
+
from pydantic.v1 import BaseModel
|
15
|
+
|
11
16
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
|
12
17
|
ExcelFormat,
|
13
18
|
FileBasedStreamConfig,
|
@@ -24,11 +29,6 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
24
29
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
25
30
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
26
31
|
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
27
|
-
from numpy import datetime64
|
28
|
-
from numpy import dtype as dtype_
|
29
|
-
from numpy import issubdtype
|
30
|
-
from orjson import orjson
|
31
|
-
from pydantic.v1 import BaseModel
|
32
32
|
|
33
33
|
|
34
34
|
class ExcelParser(FileTypeParser):
|
@@ -6,6 +6,8 @@ import json
|
|
6
6
|
import logging
|
7
7
|
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
|
8
8
|
|
9
|
+
from orjson import orjson
|
10
|
+
|
9
11
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
10
12
|
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
11
13
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
@@ -19,7 +21,6 @@ from airbyte_cdk.sources.file_based.schema_helpers import (
|
|
19
21
|
SchemaType,
|
20
22
|
merge_schemas,
|
21
23
|
)
|
22
|
-
from orjson import orjson
|
23
24
|
|
24
25
|
|
25
26
|
class JsonlParser(FileTypeParser):
|
@@ -10,6 +10,8 @@ from urllib.parse import unquote
|
|
10
10
|
|
11
11
|
import pyarrow as pa
|
12
12
|
import pyarrow.parquet as pq
|
13
|
+
from pyarrow import DictionaryArray, Scalar
|
14
|
+
|
13
15
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
|
14
16
|
FileBasedStreamConfig,
|
15
17
|
ParquetFormat,
|
@@ -26,7 +28,6 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
26
28
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
27
29
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
28
30
|
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
29
|
-
from pyarrow import DictionaryArray, Scalar
|
30
31
|
|
31
32
|
|
32
33
|
class ParquetParser(FileTypeParser):
|
@@ -9,7 +9,16 @@ from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
|
|
9
9
|
|
10
10
|
import backoff
|
11
11
|
import dpath
|
12
|
+
import nltk
|
12
13
|
import requests
|
14
|
+
from unstructured.file_utils.filetype import (
|
15
|
+
EXT_TO_FILETYPE,
|
16
|
+
FILETYPE_TO_MIMETYPE,
|
17
|
+
STR_TO_FILETYPE,
|
18
|
+
FileType,
|
19
|
+
detect_filetype,
|
20
|
+
)
|
21
|
+
|
13
22
|
from airbyte_cdk.models import FailureType
|
14
23
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
15
24
|
from airbyte_cdk.sources.file_based.config.unstructured_format import (
|
@@ -28,14 +37,6 @@ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
28
37
|
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
29
38
|
from airbyte_cdk.utils import is_cloud_environment
|
30
39
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
31
|
-
from unstructured.file_utils.filetype import (
|
32
|
-
EXT_TO_FILETYPE,
|
33
|
-
FILETYPE_TO_MIMETYPE,
|
34
|
-
STR_TO_FILETYPE,
|
35
|
-
FileType,
|
36
|
-
detect_filetype,
|
37
|
-
)
|
38
|
-
import nltk
|
39
40
|
|
40
41
|
unstructured_partition_pdf = None
|
41
42
|
unstructured_partition_docx = None
|
@@ -6,6 +6,8 @@ from abc import abstractmethod
|
|
6
6
|
from functools import cache, cached_property, lru_cache
|
7
7
|
from typing import Any, Dict, Iterable, List, Mapping, Optional, Type
|
8
8
|
|
9
|
+
from deprecated import deprecated
|
10
|
+
|
9
11
|
from airbyte_cdk import AirbyteMessage
|
10
12
|
from airbyte_cdk.models import SyncMode
|
11
13
|
from airbyte_cdk.sources.file_based.availability_strategy import (
|
@@ -30,7 +32,6 @@ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
|
30
32
|
from airbyte_cdk.sources.file_based.types import StreamSlice
|
31
33
|
from airbyte_cdk.sources.streams import Stream
|
32
34
|
from airbyte_cdk.sources.streams.checkpoint import Cursor
|
33
|
-
from deprecated import deprecated
|
34
35
|
|
35
36
|
|
36
37
|
class AbstractFileBasedStream(Stream):
|
@@ -7,6 +7,8 @@ import logging
|
|
7
7
|
from functools import cache, lru_cache
|
8
8
|
from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
9
9
|
|
10
|
+
from deprecated.classic import deprecated
|
11
|
+
|
10
12
|
from airbyte_cdk.models import (
|
11
13
|
AirbyteLogMessage,
|
12
14
|
AirbyteMessage,
|
@@ -39,11 +41,10 @@ from airbyte_cdk.sources.streams.concurrent.helpers import (
|
|
39
41
|
)
|
40
42
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
41
43
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
42
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
43
44
|
from airbyte_cdk.sources.streams.core import StreamData
|
45
|
+
from airbyte_cdk.sources.types import Record
|
44
46
|
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
|
45
47
|
from airbyte_cdk.sources.utils.slice_logger import SliceLogger
|
46
|
-
from deprecated.classic import deprecated
|
47
48
|
|
48
49
|
if TYPE_CHECKING:
|
49
50
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
|
@@ -247,7 +248,7 @@ class FileBasedStreamPartition(Partition):
|
|
247
248
|
self._stream.transformer.transform(
|
248
249
|
data_to_return, self._stream.get_json_schema()
|
249
250
|
)
|
250
|
-
yield Record(data_to_return, self)
|
251
|
+
yield Record(data=data_to_return, stream_name=self.stream_name())
|
251
252
|
elif (
|
252
253
|
isinstance(record_data, AirbyteMessage)
|
253
254
|
and record_data.type == Type.RECORD
|
@@ -265,7 +266,7 @@ class FileBasedStreamPartition(Partition):
|
|
265
266
|
else:
|
266
267
|
yield Record(
|
267
268
|
data=record_message_data,
|
268
|
-
|
269
|
+
stream_name=self.stream_name(),
|
269
270
|
is_file_transfer_message=self._use_file_transfer(),
|
270
271
|
)
|
271
272
|
else:
|
airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py
CHANGED
@@ -12,7 +12,7 @@ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
|
12
12
|
from airbyte_cdk.sources.file_based.types import StreamState
|
13
13
|
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
14
14
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
15
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
15
|
+
from airbyte_cdk.sources.types import Record
|
16
16
|
|
17
17
|
if TYPE_CHECKING:
|
18
18
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
@@ -19,7 +19,7 @@ from airbyte_cdk.sources.file_based.types import StreamState
|
|
19
19
|
from airbyte_cdk.sources.message.repository import MessageRepository
|
20
20
|
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
21
21
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
22
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
22
|
+
from airbyte_cdk.sources.types import Record
|
23
23
|
|
24
24
|
if TYPE_CHECKING:
|
25
25
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
@@ -16,7 +16,7 @@ from airbyte_cdk.sources.file_based.types import StreamState
|
|
16
16
|
from airbyte_cdk.sources.message import MessageRepository
|
17
17
|
from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
|
18
18
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
19
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
19
|
+
from airbyte_cdk.sources.types import Record
|
20
20
|
|
21
21
|
if TYPE_CHECKING:
|
22
22
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
@@ -14,9 +14,8 @@ from urllib import parse
|
|
14
14
|
|
15
15
|
import requests
|
16
16
|
import requests_cache
|
17
|
-
from pyrate_limiter import InMemoryBucket, Limiter
|
17
|
+
from pyrate_limiter import InMemoryBucket, Limiter, RateItem, TimeClock
|
18
18
|
from pyrate_limiter import Rate as PyRateRate
|
19
|
-
from pyrate_limiter import RateItem, TimeClock
|
20
19
|
from pyrate_limiter.exceptions import BucketFullException
|
21
20
|
|
22
21
|
# prevents mypy from complaining about missing session attributes in LimiterMixin
|
@@ -5,12 +5,13 @@
|
|
5
5
|
from abc import ABC, abstractmethod
|
6
6
|
from typing import Any, Iterable, Mapping, Optional
|
7
7
|
|
8
|
+
from deprecated.classic import deprecated
|
9
|
+
|
8
10
|
from airbyte_cdk.models import AirbyteStream
|
9
11
|
from airbyte_cdk.sources.source import ExperimentalClassWarning
|
10
12
|
from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability
|
11
13
|
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
12
14
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
13
|
-
from deprecated.classic import deprecated
|
14
15
|
|
15
16
|
|
16
17
|
@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning)
|
@@ -8,6 +8,8 @@ import logging
|
|
8
8
|
from functools import lru_cache
|
9
9
|
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
|
10
10
|
|
11
|
+
from deprecated.classic import deprecated
|
12
|
+
|
11
13
|
from airbyte_cdk.models import (
|
12
14
|
AirbyteLogMessage,
|
13
15
|
AirbyteMessage,
|
@@ -37,12 +39,10 @@ from airbyte_cdk.sources.streams.concurrent.helpers import (
|
|
37
39
|
)
|
38
40
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
39
41
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
40
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
41
42
|
from airbyte_cdk.sources.streams.core import StreamData
|
43
|
+
from airbyte_cdk.sources.types import Record
|
42
44
|
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
|
43
45
|
from airbyte_cdk.sources.utils.slice_logger import SliceLogger
|
44
|
-
from deprecated.classic import deprecated
|
45
|
-
|
46
46
|
from airbyte_cdk.utils.slice_hasher import SliceHasher
|
47
47
|
|
48
48
|
"""
|
@@ -294,7 +294,11 @@ class StreamPartition(Partition):
|
|
294
294
|
self._stream.transformer.transform(
|
295
295
|
data_to_return, self._stream.get_json_schema()
|
296
296
|
)
|
297
|
-
yield Record(
|
297
|
+
yield Record(
|
298
|
+
data=data_to_return,
|
299
|
+
stream_name=self.stream_name(),
|
300
|
+
associated_slice=self._slice,
|
301
|
+
)
|
298
302
|
else:
|
299
303
|
self._message_repository.emit_message(record_data)
|
300
304
|
except Exception as e:
|
@@ -6,9 +6,10 @@ import logging
|
|
6
6
|
from abc import ABC, abstractmethod
|
7
7
|
from typing import Optional
|
8
8
|
|
9
|
-
from airbyte_cdk.sources.source import ExperimentalClassWarning
|
10
9
|
from deprecated.classic import deprecated
|
11
10
|
|
11
|
+
from airbyte_cdk.sources.source import ExperimentalClassWarning
|
12
|
+
|
12
13
|
|
13
14
|
class StreamAvailability(ABC):
|
14
15
|
@abstractmethod
|
@@ -3,19 +3,32 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
import functools
|
6
|
+
import logging
|
6
7
|
from abc import ABC, abstractmethod
|
7
|
-
from typing import
|
8
|
+
from typing import (
|
9
|
+
Any,
|
10
|
+
Callable,
|
11
|
+
Iterable,
|
12
|
+
List,
|
13
|
+
Mapping,
|
14
|
+
MutableMapping,
|
15
|
+
Optional,
|
16
|
+
Protocol,
|
17
|
+
Tuple,
|
18
|
+
Union,
|
19
|
+
)
|
8
20
|
|
9
21
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
10
22
|
from airbyte_cdk.sources.message import MessageRepository
|
11
23
|
from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
|
12
24
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
13
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
14
25
|
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
|
15
26
|
from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
|
16
27
|
AbstractStreamStateConverter,
|
17
28
|
)
|
18
|
-
from airbyte_cdk.sources.types import StreamSlice
|
29
|
+
from airbyte_cdk.sources.types import Record, StreamSlice
|
30
|
+
|
31
|
+
LOGGER = logging.getLogger("airbyte")
|
19
32
|
|
20
33
|
|
21
34
|
def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any:
|
@@ -173,9 +186,13 @@ class ConcurrentCursor(Cursor):
|
|
173
186
|
self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
|
174
187
|
self._lookback_window = lookback_window
|
175
188
|
self._slice_range = slice_range
|
176
|
-
self._most_recent_cursor_value_per_partition: MutableMapping[
|
189
|
+
self._most_recent_cursor_value_per_partition: MutableMapping[
|
190
|
+
Union[StreamSlice, Mapping[str, Any], None], Any
|
191
|
+
] = {}
|
177
192
|
self._has_closed_at_least_one_slice = False
|
178
193
|
self._cursor_granularity = cursor_granularity
|
194
|
+
# Flag to track if the logger has been triggered (per stream)
|
195
|
+
self._should_be_synced_logger_triggered = False
|
179
196
|
|
180
197
|
@property
|
181
198
|
def state(self) -> MutableMapping[str, Any]:
|
@@ -210,12 +227,15 @@ class ConcurrentCursor(Cursor):
|
|
210
227
|
|
211
228
|
def observe(self, record: Record) -> None:
|
212
229
|
most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
|
213
|
-
record.
|
230
|
+
record.associated_slice
|
214
231
|
)
|
215
|
-
|
232
|
+
try:
|
233
|
+
cursor_value = self._extract_cursor_value(record)
|
216
234
|
|
217
|
-
|
218
|
-
|
235
|
+
if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value:
|
236
|
+
self._most_recent_cursor_value_per_partition[record.associated_slice] = cursor_value
|
237
|
+
except ValueError:
|
238
|
+
self._log_for_record_without_cursor_value()
|
219
239
|
|
220
240
|
def _extract_cursor_value(self, record: Record) -> Any:
|
221
241
|
return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
|
@@ -231,7 +251,9 @@ class ConcurrentCursor(Cursor):
|
|
231
251
|
self._has_closed_at_least_one_slice = True
|
232
252
|
|
233
253
|
def _add_slice_to_state(self, partition: Partition) -> None:
|
234
|
-
most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
|
254
|
+
most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
|
255
|
+
partition.to_slice()
|
256
|
+
)
|
235
257
|
|
236
258
|
if self._slice_boundary_fields:
|
237
259
|
if "slices" not in self.state:
|
@@ -442,3 +464,24 @@ class ConcurrentCursor(Cursor):
|
|
442
464
|
return lower + step
|
443
465
|
except OverflowError:
|
444
466
|
return self._end_provider()
|
467
|
+
|
468
|
+
def should_be_synced(self, record: Record) -> bool:
|
469
|
+
"""
|
470
|
+
Determines if a record should be synced based on its cursor value.
|
471
|
+
:param record: The record to evaluate
|
472
|
+
|
473
|
+
:return: True if the record's cursor value falls within the sync boundaries
|
474
|
+
"""
|
475
|
+
try:
|
476
|
+
record_cursor_value: CursorValueType = self._extract_cursor_value(record) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
|
477
|
+
except ValueError:
|
478
|
+
self._log_for_record_without_cursor_value()
|
479
|
+
return True
|
480
|
+
return self.start <= record_cursor_value <= self._end_provider()
|
481
|
+
|
482
|
+
def _log_for_record_without_cursor_value(self) -> None:
|
483
|
+
if not self._should_be_synced_logger_triggered:
|
484
|
+
LOGGER.warning(
|
485
|
+
f"Could not find cursor field `{self.cursor_field.cursor_field_key}` in record for stream {self._stream_name}. The incremental sync will assume it needs to be synced"
|
486
|
+
)
|
487
|
+
self._should_be_synced_logger_triggered = True
|
@@ -8,7 +8,7 @@ from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentin
|
|
8
8
|
PartitionGenerationCompletedSentinel,
|
9
9
|
)
|
10
10
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
11
|
-
from airbyte_cdk.sources.
|
11
|
+
from airbyte_cdk.sources.types import Record
|
12
12
|
|
13
13
|
|
14
14
|
class PartitionCompleteSentinel:
|
@@ -7,6 +7,7 @@ from datetime import datetime, timedelta, timezone
|
|
7
7
|
from typing import Any, Callable, List, MutableMapping, Optional, Tuple
|
8
8
|
|
9
9
|
import pendulum
|
10
|
+
from pendulum.datetime import DateTime
|
10
11
|
|
11
12
|
# FIXME We would eventually like the Concurrent package do be agnostic of the declarative package. However, this is a breaking change and
|
12
13
|
# the goal in the short term is only to fix the issue we are seeing for source-declarative-manifest.
|
@@ -16,7 +17,6 @@ from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_sta
|
|
16
17
|
AbstractStreamStateConverter,
|
17
18
|
ConcurrencyCompatibleStateType,
|
18
19
|
)
|
19
|
-
from pendulum.datetime import DateTime
|
20
20
|
|
21
21
|
|
22
22
|
class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|