airbyte-source-google-sheets 0.8.4__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
METADATA
@@ -1,23 +1,19 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.3
  Name: airbyte-source-google-sheets
- Version: 0.8.4
+ Version: 0.9.0rc1
  Summary: Source implementation for Google Sheets.
- Home-page: https://airbyte.com
  License: Elv2
  Author: Airbyte
  Author-email: contact@airbyte.io
- Requires-Python: >=3.10,<4.0
+ Requires-Python: >=3.10,<3.13
  Classifier: License :: Other/Proprietary License
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: Unidecode (==1.3.8)
- Requires-Dist: airbyte-cdk (>=4,<5)
- Requires-Dist: google-api-python-client (==2.114.0)
- Requires-Dist: google-auth-httplib2 (==0.2.0)
+ Requires-Dist: airbyte-cdk (>=6,<7)
  Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-sheets
+ Project-URL: Homepage, https://airbyte.com
  Project-URL: Repository, https://github.com/airbytehq/airbyte
  Description-Content-Type: text/markdown
RECORD
@@ -0,0 +1,16 @@
+ source_google_sheets/__init__.py,sha256=dYaZS0KrTjksk_yeSuXmNDXDsNKWctFnlZJSMgLryXE,135
+ source_google_sheets/components/__init__.py,sha256=v7odPaLdz2S_NRzExsxkk6694Vmjbplz4Z5mA4lxdkA,343
+ source_google_sheets/components/extractors.py,sha256=S7lPBuy9MO_mnl3h4B48F_szuusvDdvpX0OBK-_AxVA,8909
+ source_google_sheets/components/partition_routers.py,sha256=SWo1V0K10ZdWE2TQ0KuQUfue04RTyHJe1f6BOj6c96s,1265
+ source_google_sheets/manifest.yaml,sha256=qoCSQ4i7PnXCArmJ4RPOQA_31j8Vmd22ryI9ZL2cVDc,15756
+ source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
+ source_google_sheets/models/spreadsheet.py,sha256=DEef7bWQEpY1Uqyk7RN6qLF8oxLSNzuAtHlGRmMAKQA,1112
+ source_google_sheets/models/spreadsheet_values.py,sha256=-XRMuuILn9JN8svHNTj6-oG8mLTZOZ5Hejy2pJ5bILk,440
+ source_google_sheets/run.py,sha256=eaPRcarWqkB2b2DokvI83w7rz1blmWPQCFahvCyCdSY,1887
+ source_google_sheets/source.py,sha256=qO1KoGdphieu7F5VgDYtrbqs56AUvMWFGNvFHP2b9Z4,778
+ source_google_sheets/spec.yaml,sha256=RIUILMhfS0is2r_mCkmIVrQfvND1D3eobDK1YElmzhU,5009
+ source_google_sheets/utils.py,sha256=JEQIVLSFEAff-7zF3gPzsvFc9xLfCj9hVuFFYrSWiOo,2290
+ airbyte_source_google_sheets-0.9.0rc1.dist-info/METADATA,sha256=V4s8KsVDIXElKB48e04etCJ7Le4i-KBOS0xQZZpK9Nw,5371
+ airbyte_source_google_sheets-0.9.0rc1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ airbyte_source_google_sheets-0.9.0rc1.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
+ airbyte_source_google_sheets-0.9.0rc1.dist-info/RECORD,,
WHEEL
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.1
+ Generator: poetry-core 2.0.1
  Root-Is-Purelib: true
  Tag: py3-none-any
source_google_sheets/__init__.py
@@ -1,3 +1,7 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
  from .source import SourceGoogleSheets

  __all__ = ["SourceGoogleSheets"]
source_google_sheets/components/__init__.py
@@ -0,0 +1,8 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ from source_google_sheets.components.extractors import DpathSchemaMatchingExtractor, DpathSchemaExtractor
+ from source_google_sheets.components.partition_routers import RangePartitionRouter
+
+ __all__ = ["DpathSchemaMatchingExtractor", "RangePartitionRouter", "DpathSchemaExtractor"]
source_google_sheets/components/extractors.py
@@ -0,0 +1,207 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
+
+ import dpath
+ import requests
+
+ from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
+ from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
+ from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
+ from airbyte_cdk.sources.types import Config
+ from source_google_sheets.utils import name_conversion, safe_name_conversion
+
+
+ class RawSchemaParser:
+     config: Config
+
+     def _extract_data(
+         self,
+         body: Mapping[str, Any],
+         extraction_path: Optional[List[Union[InterpolatedString, str]]] = None,
+         default: Any = None,
+     ) -> Any:
+         """
+         Extracts data from the body based on the provided extraction path.
+         """
+
+         if not extraction_path:
+             return body
+
+         path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]
+
+         return dpath.get(body, path, default=default)  # type: ignore # extracted
+
+     def _set_data(
+         self, value: Any, body: MutableMapping[str, Any], extraction_path: Optional[List[Union[InterpolatedString, str]]] = None
+     ) -> Any:
+         """
+         Sets data in the body based on the provided extraction path.
+         """
+         if not extraction_path:
+             return value  # no extraction path: nothing to set in place
+
+         path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]
+
+         dpath.set(body, path, value=value)
+
+     def parse_raw_schema_values(
+         self,
+         raw_schema_data: MutableMapping[Any, Any],
+         schema_pointer: List[Union[InterpolatedString, str]],
+         key_pointer: List[Union[InterpolatedString, str]],
+         names_conversion: bool,
+     ):
+         """
+         1. Parses sheet headers from the provided raw schema. This method assumes that the data is contiguous,
+         i.e. every cell contains a value, and the first cell which does not contain a value denotes the end
+         of the headers.
+         2. Applies name conversion if required.
+         3. Removes duplicated fields from the schema.
+         Returns a list of tuples of (property index as found in the array, parsed value, raw schema property).
+         """
+         raw_schema_properties = self._extract_data(raw_schema_data, schema_pointer, default=[])
+         duplicate_fields = set()
+         parsed_schema_values = []
+         seen_values = set()
+         for property_index, raw_schema_property in enumerate(raw_schema_properties):
+             raw_schema_property_value = self._extract_data(raw_schema_property, key_pointer)
+             if not raw_schema_property_value:
+                 break
+             if names_conversion:
+                 raw_schema_property_value = safe_name_conversion(raw_schema_property_value)
+
+             if raw_schema_property_value in seen_values:
+                 duplicate_fields.add(raw_schema_property_value)
+             seen_values.add(raw_schema_property_value)
+             parsed_schema_values.append((property_index, raw_schema_property_value, raw_schema_property))
+
+         if duplicate_fields:
+             parsed_schema_values = [
+                 parsed_schema_value for parsed_schema_value in parsed_schema_values if parsed_schema_value[1] not in duplicate_fields
+             ]
+
+         return parsed_schema_values
+
+     def parse(self, schema_type_identifier, records: Iterable[MutableMapping[Any, Any]]):
+         """Removes duplicated fields and applies name conversion."""
+         names_conversion = self.config.get("names_conversion", False)
+         schema_pointer = schema_type_identifier.get("schema_pointer")
+         key_pointer = schema_type_identifier["key_pointer"]
+         parsed_properties = []
+         for raw_schema_data in records:
+             for _, parsed_value, raw_schema_property in self.parse_raw_schema_values(
+                 raw_schema_data, schema_pointer, key_pointer, names_conversion
+             ):
+                 self._set_data(parsed_value, raw_schema_property, key_pointer)
+                 parsed_properties.append(raw_schema_property)
+             self._set_data(parsed_properties, raw_schema_data, schema_pointer)
+             yield raw_schema_data
+
+
+ @dataclass
+ class DpathSchemaMatchingExtractor(DpathExtractor, RawSchemaParser):
+     """
+     The stock DpathExtractor has problems with this type of data in a response:
+     [
+         {
+             "values": [
+                 [
+                     "name1",
+                     "22"
+                 ],
+                 [
+                     "name2",
+                     "24"
+                 ],
+                 [
+                     "name3",
+                     "25"
+                 ]
+             ]
+         }
+     ]
+
+     This is because the "values" field is a list of lists rather than objects that could be extracted with "*".
+     To extract it, the ordered properties from the schema are needed so they can be matched against each list of values.
+     Then, given a properties object like {0: 'name', 1: 'age'}, we end up with:
+
+     {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name1","age":"22"},"emitted_at":1734371904128}}
+     {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name2","age":"24"},"emitted_at":1734371904134}}
+     {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name3","age":"25"},"emitted_at":1734371904134}}
+     """
+
+     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+         super().__post_init__(parameters)
+         self.decoder = JsonDecoder(parameters={})
+         self._values_to_match_key = parameters["values_to_match_key"]
+         schema_type_identifier = parameters["schema_type_identifier"]
+         names_conversion = self.config.get("names_conversion", False)
+         self._indexed_properties_to_match = self.extract_properties_to_match(
+             parameters["properties_to_match"], schema_type_identifier, names_conversion=names_conversion
+         )
+
+     def extract_properties_to_match(self, properties_to_match, schema_type_identifier, names_conversion):
+         schema_pointer = schema_type_identifier.get("schema_pointer")
+         key_pointer = schema_type_identifier["key_pointer"]
+         indexed_properties = {}
+         for property_index, property_parsed_value, _ in self.parse_raw_schema_values(
+             properties_to_match, schema_pointer, key_pointer, names_conversion
+         ):
+             indexed_properties[property_index] = property_parsed_value
+         return indexed_properties
+
+     @staticmethod
+     def match_properties_with_values(unmatched_values: List[str], indexed_properties: Dict[int, str]):
+         data = {}
+         for relevant_index in sorted(indexed_properties.keys()):
+             if relevant_index >= len(unmatched_values):
+                 break
+
+             unmatch_value = unmatched_values[relevant_index]
+             if unmatch_value.strip() != "":
+                 data[indexed_properties[relevant_index]] = unmatch_value
+         yield data
+
+     @staticmethod
+     def is_row_empty(cell_values: List[str]) -> bool:
+         for cell in cell_values:
+             if cell.strip() != "":
+                 return False
+         return True
+
+     @staticmethod
+     def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
+         for idx in relevant_indices:
+             if len(cell_values) > idx and cell_values[idx].strip() != "":
+                 return True
+         return False
+
+     def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
+         raw_records_extracted = super().extract_records(response=response)
+         for raw_record in raw_records_extracted:
+             unmatched_values_collection = raw_record.get(self._values_to_match_key, [])
+             for unmatched_values in unmatched_values_collection:
+                 if not DpathSchemaMatchingExtractor.is_row_empty(
+                     unmatched_values
+                 ) and DpathSchemaMatchingExtractor.row_contains_relevant_data(unmatched_values, self._indexed_properties_to_match.keys()):
+                     yield from DpathSchemaMatchingExtractor.match_properties_with_values(
+                         unmatched_values, self._indexed_properties_to_match
+                     )
+
+
+ class DpathSchemaExtractor(DpathExtractor, RawSchemaParser):
+     """
+     Applies name conversion and parses sheet headers from the provided row.
+     """
+
+     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+         super().__post_init__(parameters)
+         self.schema_type_identifier = parameters["schema_type_identifier"]
+
+     def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
+         extracted_records = super().extract_records(response=response)
+         yield from self.parse(schema_type_identifier=self.schema_type_identifier, records=extracted_records)
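
To make the docstring above concrete, here is a minimal sketch (not part of the diff; plain Python with no CDK imports, all names invented for the demo) of the matching step: the header row becomes an index-to-property map, and each row of "values" is zipped against it, skipping blank cells (duplicated headers would already have been dropped by parse_raw_schema_values):

    headers = ["name", "age"]
    indexed_properties = {i: h for i, h in enumerate(headers)}  # {0: 'name', 1: 'age'}

    rows = [["name1", "22"], ["name2", "24"], ["name3", "25"], ["", ""]]
    for row in rows:
        if all(cell.strip() == "" for cell in row):  # mirrors is_row_empty
            continue
        record = {
            indexed_properties[i]: row[i]
            for i in sorted(indexed_properties)
            if i < len(row) and row[i].strip() != ""
        }
        print(record)
    # {'name': 'name1', 'age': '22'}
    # {'name': 'name2', 'age': '24'}
    # {'name': 'name3', 'age': '25'}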
source_google_sheets/components/partition_routers.py
@@ -0,0 +1,36 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ import logging
+ from typing import Any, Iterable, Mapping
+
+ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
+ from airbyte_cdk.sources.types import StreamSlice
+
+
+ logger = logging.getLogger("airbyte")
+
+
+ class RangePartitionRouter(SinglePartitionRouter):
+     """
+     Creates row ranges for requesting sheet data from the Google Sheets API.
+     """
+
+     parameters: Mapping[str, Any]
+
+     def __init__(self, parameters: Mapping[str, Any]) -> None:
+         super().__init__(parameters)
+         self.parameters = parameters
+         self.sheet_row_count = parameters.get("row_count", 0)
+         self.sheet_id = parameters.get("sheet_id")
+         self.batch_size = parameters.get("batch_size")
+
+     def stream_slices(self) -> Iterable[StreamSlice]:
+         start_range = 2  # skip row 1, which is expected to hold the column (field) names
+
+         while start_range <= self.sheet_row_count:
+             end_range = start_range + self.batch_size
+             logger.info(f"Fetching range {self.sheet_id}!{start_range}:{end_range}")
+             yield StreamSlice(partition={"start_range": start_range, "end_range": end_range}, cursor_slice={})
+             start_range = end_range + 1
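
For reference, a plain-Python sketch (not part of the diff; the function name is invented) of the slices this router yields, e.g. for a 10-row sheet with a batch size of 4:

    def slice_ranges(row_count: int, batch_size: int):
        # Mirrors RangePartitionRouter.stream_slices: start at row 2, because
        # row 1 is expected to hold the field names, then step through in batches.
        start = 2
        while start <= row_count:
            end = start + batch_size
            yield {"start_range": start, "end_range": end}
            start = end + 1

    print(list(slice_ranges(row_count=10, batch_size=4)))
    # [{'start_range': 2, 'end_range': 6}, {'start_range': 7, 'end_range': 11}]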