airbyte-source-google-sheets 0.8.4__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airbyte_source_google_sheets-0.8.4.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/METADATA +5 -9
- airbyte_source_google_sheets-0.9.0rc1.dist-info/RECORD +16 -0
- {airbyte_source_google_sheets-0.8.4.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/WHEEL +1 -1
- source_google_sheets/__init__.py +4 -0
- source_google_sheets/components/__init__.py +8 -0
- source_google_sheets/components/extractors.py +207 -0
- source_google_sheets/components/partition_routers.py +36 -0
- source_google_sheets/manifest.yaml +407 -0
- source_google_sheets/models/spreadsheet.py +1 -1
- source_google_sheets/models/spreadsheet_values.py +1 -1
- source_google_sheets/run.py +43 -5
- source_google_sheets/source.py +14 -269
- source_google_sheets/spec.yaml +4 -6
- source_google_sheets/utils.py +2 -1
- airbyte_source_google_sheets-0.8.4.dist-info/RECORD +0 -14
- source_google_sheets/client.py +0 -48
- source_google_sheets/helpers.py +0 -232
- {airbyte_source_google_sheets-0.8.4.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/entry_points.txt +0 -0
@@ -1,23 +1,19 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.3
|
2
2
|
Name: airbyte-source-google-sheets
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9.0rc1
|
4
4
|
Summary: Source implementation for Google Sheets.
|
5
|
-
Home-page: https://airbyte.com
|
6
5
|
License: Elv2
|
7
6
|
Author: Airbyte
|
8
7
|
Author-email: contact@airbyte.io
|
9
|
-
Requires-Python: >=3.10,<
|
8
|
+
Requires-Python: >=3.10,<3.13
|
10
9
|
Classifier: License :: Other/Proprietary License
|
11
10
|
Classifier: Programming Language :: Python :: 3
|
12
11
|
Classifier: Programming Language :: Python :: 3.10
|
13
12
|
Classifier: Programming Language :: Python :: 3.11
|
14
13
|
Classifier: Programming Language :: Python :: 3.12
|
15
|
-
|
16
|
-
Requires-Dist: Unidecode (==1.3.8)
|
17
|
-
Requires-Dist: airbyte-cdk (>=4,<5)
|
18
|
-
Requires-Dist: google-api-python-client (==2.114.0)
|
19
|
-
Requires-Dist: google-auth-httplib2 (==0.2.0)
|
14
|
+
Requires-Dist: airbyte-cdk (>=6,<7)
|
20
15
|
Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-sheets
|
16
|
+
Project-URL: Homepage, https://airbyte.com
|
21
17
|
Project-URL: Repository, https://github.com/airbytehq/airbyte
|
22
18
|
Description-Content-Type: text/markdown
|
23
19
|
|
@@ -0,0 +1,16 @@
|
|
1
|
+
source_google_sheets/__init__.py,sha256=dYaZS0KrTjksk_yeSuXmNDXDsNKWctFnlZJSMgLryXE,135
|
2
|
+
source_google_sheets/components/__init__.py,sha256=v7odPaLdz2S_NRzExsxkk6694Vmjbplz4Z5mA4lxdkA,343
|
3
|
+
source_google_sheets/components/extractors.py,sha256=S7lPBuy9MO_mnl3h4B48F_szuusvDdvpX0OBK-_AxVA,8909
|
4
|
+
source_google_sheets/components/partition_routers.py,sha256=SWo1V0K10ZdWE2TQ0KuQUfue04RTyHJe1f6BOj6c96s,1265
|
5
|
+
source_google_sheets/manifest.yaml,sha256=qoCSQ4i7PnXCArmJ4RPOQA_31j8Vmd22ryI9ZL2cVDc,15756
|
6
|
+
source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
|
7
|
+
source_google_sheets/models/spreadsheet.py,sha256=DEef7bWQEpY1Uqyk7RN6qLF8oxLSNzuAtHlGRmMAKQA,1112
|
8
|
+
source_google_sheets/models/spreadsheet_values.py,sha256=-XRMuuILn9JN8svHNTj6-oG8mLTZOZ5Hejy2pJ5bILk,440
|
9
|
+
source_google_sheets/run.py,sha256=eaPRcarWqkB2b2DokvI83w7rz1blmWPQCFahvCyCdSY,1887
|
10
|
+
source_google_sheets/source.py,sha256=qO1KoGdphieu7F5VgDYtrbqs56AUvMWFGNvFHP2b9Z4,778
|
11
|
+
source_google_sheets/spec.yaml,sha256=RIUILMhfS0is2r_mCkmIVrQfvND1D3eobDK1YElmzhU,5009
|
12
|
+
source_google_sheets/utils.py,sha256=JEQIVLSFEAff-7zF3gPzsvFc9xLfCj9hVuFFYrSWiOo,2290
|
13
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/METADATA,sha256=V4s8KsVDIXElKB48e04etCJ7Le4i-KBOS0xQZZpK9Nw,5371
|
14
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
15
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
|
16
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/RECORD,,
|
source_google_sheets/__init__.py
CHANGED
@@ -0,0 +1,8 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from source_google_sheets.components.extractors import DpathSchemaMatchingExtractor, DpathSchemaExtractor
|
6
|
+
from source_google_sheets.components.partition_routers import RangePartitionRouter
|
7
|
+
|
8
|
+
__all__ = ["DpathSchemaMatchingExtractor", "RangePartitionRouter", "DpathSchemaExtractor"]
|
@@ -0,0 +1,207 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
|
7
|
+
|
8
|
+
import dpath
|
9
|
+
import requests
|
10
|
+
|
11
|
+
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
|
12
|
+
from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
|
13
|
+
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
|
14
|
+
from airbyte_cdk.sources.types import Config
|
15
|
+
from source_google_sheets.utils import name_conversion, safe_name_conversion
|
16
|
+
|
17
|
+
|
18
|
+
class RawSchemaParser:
    """
    Mixin that cleans up raw sheet-header rows extracted from a Google Sheets API response.

    Responsibilities: evaluate dpath-style pointers against a record body, optionally
    convert header names, and drop duplicated header fields. Relies on the host class
    (a ``DpathExtractor`` dataclass) to supply ``self.config``.
    """

    config: Config

    def _extract_data(
        self,
        body: Mapping[str, Any],
        extraction_path: Optional[List[Union[InterpolatedString, str]]] = None,
        default: Any = None,
    ) -> Any:
        """
        Extracts data from the body based on the provided extraction path.

        Returns ``body`` untouched when no path is given, and ``default`` when the
        path does not exist in ``body``.
        """
        if not extraction_path:
            return body

        # InterpolatedString nodes are evaluated against the connector config;
        # plain strings are used as-is.
        path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]

        return dpath.get(body, path, default=default)  # type: ignore # extracted

    def _set_data(
        self, value: Any, body: MutableMapping[str, Any], extraction_path: Optional[List[Union[InterpolatedString, str]]] = None
    ) -> Any:
        """
        Sets data in the body based on the provided extraction path.

        With no ``extraction_path`` there is nothing to navigate into, so the call is a
        no-op for the caller; returning early also avoids iterating a ``None`` path
        below (previously a TypeError when ``parse`` passed a missing schema_pointer).
        """
        if not extraction_path:
            body = value
            return

        path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]

        dpath.set(body, path, value=value)

    def parse_raw_schema_values(
        self,
        raw_schema_data: MutableMapping[Any, Any],
        schema_pointer: List[Union[InterpolatedString, str]],
        key_pointer: List[Union[InterpolatedString, str]],
        names_conversion: bool,
    ):
        """
        1. Parses sheet headers from the provided raw schema. This method assumes that data is contiguous
            i.e: every cell contains a value and the first cell which does not contain a value denotes the end
            of the headers.
        2. Makes name conversion if required.
        3. Removes duplicated fields from the schema.
        Return a list of tuples with correct property index (by found in array), value and raw_schema
        """
        raw_schema_properties = self._extract_data(raw_schema_data, schema_pointer, default=[])
        duplicate_fields = set()
        parsed_schema_values = []
        seen_values = set()
        for property_index, raw_schema_property in enumerate(raw_schema_properties):
            raw_schema_property_value = self._extract_data(raw_schema_property, key_pointer)
            # A blank header cell marks the end of the contiguous header row.
            if not raw_schema_property_value:
                break
            if names_conversion:
                raw_schema_property_value = safe_name_conversion(raw_schema_property_value)

            if raw_schema_property_value in seen_values:
                duplicate_fields.add(raw_schema_property_value)
            seen_values.add(raw_schema_property_value)
            parsed_schema_values.append((property_index, raw_schema_property_value, raw_schema_property))

        if duplicate_fields:
            # Drop every occurrence of a duplicated header name, not just the extras.
            parsed_schema_values = [
                parsed_schema_value for parsed_schema_value in parsed_schema_values if parsed_schema_value[1] not in duplicate_fields
            ]

        return parsed_schema_values

    def parse(self, schema_type_identifier, records: Iterable[MutableMapping[Any, Any]]):
        """Removes duplicated fields and makes names conversion"""
        names_conversion = self.config.get("names_conversion", False)
        schema_pointer = schema_type_identifier.get("schema_pointer")
        key_pointer = schema_type_identifier["key_pointer"]
        # NOTE(review): parsed_properties is shared across all records; if more than one
        # schema record is ever yielded, earlier properties would leak into later
        # records — confirm single-record usage before changing.
        parsed_properties = []
        for raw_schema_data in records:
            for _, parsed_value, raw_schema_property in self.parse_raw_schema_values(
                raw_schema_data, schema_pointer, key_pointer, names_conversion
            ):
                # Write the (possibly converted, deduplicated) header value back in place.
                self._set_data(parsed_value, raw_schema_property, key_pointer)
                parsed_properties.append(raw_schema_property)
            self._set_data(parsed_properties, raw_schema_data, schema_pointer)
            yield raw_schema_data
103
|
+
|
104
|
+
|
105
|
+
@dataclass
class DpathSchemaMatchingExtractor(DpathExtractor, RawSchemaParser):
    """
    Extractor that pairs positional row values with schema-derived property names.

    The Google Sheets API returns each row as a bare list of cell strings rather
    than as an object, e.g. ``{"values": [["name1", "22"], ["name2", "24"]]}``,
    which the stock DpathExtractor cannot map with ``"*"``. This extractor builds
    an index -> property-name mapping from the parsed header row (e.g.
    ``{0: "name", 1: "age"}``) and zips each value list against it, producing
    records such as ``{"name": "name1", "age": "22"}``. Rows that are entirely
    blank, or blank in every mapped column, are skipped.
    """

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        super().__post_init__(parameters)
        self.decoder = JsonDecoder(parameters={})
        self._values_to_match_key = parameters["values_to_match_key"]
        schema_type_identifier = parameters["schema_type_identifier"]
        self._indexed_properties_to_match = self.extract_properties_to_match(
            parameters["properties_to_match"],
            schema_type_identifier,
            names_conversion=self.config.get("names_conversion", False),
        )

    def extract_properties_to_match(self, properties_to_match, schema_type_identifier, names_conversion):
        """Build the {column_index: property_name} mapping from the raw header schema."""
        parsed = self.parse_raw_schema_values(
            properties_to_match,
            schema_type_identifier.get("schema_pointer"),
            schema_type_identifier["key_pointer"],
            names_conversion,
        )
        return {index: value for index, value, _ in parsed}

    @staticmethod
    def match_properties_with_values(unmatched_values: List[str], indexed_properties: Dict[int, str]):
        """Yield one record mapping each known column index to its cell value; blank cells are omitted."""
        record = {}
        for index in sorted(indexed_properties):
            # Short rows simply lack the trailing columns.
            if index >= len(unmatched_values):
                break
            cell = unmatched_values[index]
            if cell.strip():
                record[indexed_properties[index]] = cell
        yield record

    @staticmethod
    def is_row_empty(cell_values: List[str]) -> bool:
        """True when every cell in the row is blank or whitespace."""
        return all(not cell.strip() for cell in cell_values)

    @staticmethod
    def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
        """True when at least one mapped column holds a non-blank value."""
        return any(len(cell_values) > index and cell_values[index].strip() for index in relevant_indices)

    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
        for raw_record in super().extract_records(response=response):
            for row_values in raw_record.get(self._values_to_match_key, []):
                if self.is_row_empty(row_values):
                    continue
                if self.row_contains_relevant_data(row_values, self._indexed_properties_to_match.keys()):
                    yield from self.match_properties_with_values(row_values, self._indexed_properties_to_match)
|
194
|
+
|
195
|
+
|
196
|
+
class DpathSchemaExtractor(DpathExtractor, RawSchemaParser):
    """
    Extractor that post-processes dpath-extracted records as sheet headers:
    applies name conversion and removes duplicated header fields.
    """

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        super().__post_init__(parameters)
        self.schema_type_identifier = parameters["schema_type_identifier"]

    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
        records = super().extract_records(response=response)
        yield from self.parse(schema_type_identifier=self.schema_type_identifier, records=records)
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from typing import Any, Iterable, Mapping
|
7
|
+
|
8
|
+
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
|
9
|
+
from airbyte_cdk.sources.types import StreamSlice
|
10
|
+
|
11
|
+
|
12
|
+
logger = logging.getLogger("airbyte")
|
13
|
+
|
14
|
+
|
15
|
+
class RangePartitionRouter(SinglePartitionRouter):
    """
    Partition router that slices a sheet into consecutive row ranges so record
    data can be requested from the Google Sheets API in batches.
    """

    parameters: Mapping[str, Any]

    def __init__(self, parameters: Mapping[str, Any]) -> None:
        super().__init__(parameters)
        self.parameters = parameters
        self.sheet_row_count = parameters.get("row_count", 0)
        self.sheet_id = parameters.get("sheet_id")
        self.batch_size = parameters.get("batch_size")

    def stream_slices(self) -> Iterable[StreamSlice]:
        # Row 1 holds the column (field) names, so data starts at row 2.
        start_range = 2

        while start_range <= self.sheet_row_count:
            end_range = start_range + self.batch_size
            logger.info(f"Fetching range {self.sheet_id}!{start_range}:{end_range}")
            yield StreamSlice(partition={"start_range": start_range, "end_range": end_range}, cursor_slice={})
            # Ranges are inclusive, so the next slice begins one row past the last.
            start_range = end_range + 1
|