airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0rc2__py3-none-any.whl

{airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0rc2.dist-info}/METADATA
@@ -1,21 +1,17 @@
  Metadata-Version: 2.3
  Name: airbyte-source-google-sheets
- Version: 0.8.5
+ Version: 0.9.0rc2
  Summary: Source implementation for Google Sheets.
  License: Elv2
  Author: Airbyte
  Author-email: contact@airbyte.io
- Requires-Python: >=3.10,<4.0
+ Requires-Python: >=3.10,<3.13
  Classifier: License :: Other/Proprietary License
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: Unidecode (==1.3.8)
- Requires-Dist: airbyte-cdk (>=4,<5)
- Requires-Dist: google-api-python-client (==2.114.0)
- Requires-Dist: google-auth-httplib2 (==0.2.0)
+ Requires-Dist: airbyte-cdk (>=6,<7)
  Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-sheets
  Project-URL: Homepage, https://airbyte.com
  Project-URL: Repository, https://github.com/airbytehq/airbyte
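Note on the metadata changes above: the new build narrows Requires-Python to <3.13 (the 3.13 classifier is dropped), moves airbyte-cdk from the 4.x to the 6.x major line, and no longer declares Unidecode, google-api-python-client, or google-auth-httplib2 as direct dependencies. A minimal pre-upgrade interpreter check; the version bounds are copied from the METADATA diff, while the script itself is purely illustrative and not part of the wheel:

# check_python.py -- illustrative helper, not shipped with the package.
import sys

# Bounds copied from the new METADATA: Requires-Python: >=3.10,<3.13
LOW, HIGH = (3, 10), (3, 13)

def python_supported() -> bool:
    """Return True if the running interpreter satisfies the 0.9.0rc2 constraint."""
    return LOW <= sys.version_info[:2] < HIGH

if __name__ == "__main__":
    print("supported" if python_supported() else "unsupported", sys.version.split()[0])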
airbyte_source_google_sheets-0.9.0rc2.dist-info/RECORD
@@ -0,0 +1,16 @@
+ source_google_sheets/__init__.py,sha256=dYaZS0KrTjksk_yeSuXmNDXDsNKWctFnlZJSMgLryXE,135
+ source_google_sheets/components/__init__.py,sha256=v7odPaLdz2S_NRzExsxkk6694Vmjbplz4Z5mA4lxdkA,343
+ source_google_sheets/components/extractors.py,sha256=S7lPBuy9MO_mnl3h4B48F_szuusvDdvpX0OBK-_AxVA,8909
+ source_google_sheets/components/partition_routers.py,sha256=SWo1V0K10ZdWE2TQ0KuQUfue04RTyHJe1f6BOj6c96s,1265
+ source_google_sheets/manifest.yaml,sha256=vUSTlaaDlja8CjHxmx-JFDOgmBP9BJC-puisx2ToEzE,15792
+ source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
+ source_google_sheets/models/spreadsheet.py,sha256=DEef7bWQEpY1Uqyk7RN6qLF8oxLSNzuAtHlGRmMAKQA,1112
+ source_google_sheets/models/spreadsheet_values.py,sha256=-XRMuuILn9JN8svHNTj6-oG8mLTZOZ5Hejy2pJ5bILk,440
+ source_google_sheets/run.py,sha256=eaPRcarWqkB2b2DokvI83w7rz1blmWPQCFahvCyCdSY,1887
+ source_google_sheets/source.py,sha256=qO1KoGdphieu7F5VgDYtrbqs56AUvMWFGNvFHP2b9Z4,778
+ source_google_sheets/spec.yaml,sha256=RIUILMhfS0is2r_mCkmIVrQfvND1D3eobDK1YElmzhU,5009
+ source_google_sheets/utils.py,sha256=JEQIVLSFEAff-7zF3gPzsvFc9xLfCj9hVuFFYrSWiOo,2290
+ airbyte_source_google_sheets-0.9.0rc2.dist-info/METADATA,sha256=d6XRDNn0v7pO4lGtJji_KvCJtX63WCqTILijgvYB_mE,5371
+ airbyte_source_google_sheets-0.9.0rc2.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ airbyte_source_google_sheets-0.9.0rc2.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
+ airbyte_source_google_sheets-0.9.0rc2.dist-info/RECORD,,
source_google_sheets/__init__.py
@@ -1,3 +1,7 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
  from .source import SourceGoogleSheets

  __all__ = ["SourceGoogleSheets"]
source_google_sheets/components/__init__.py
@@ -0,0 +1,8 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ from source_google_sheets.components.extractors import DpathSchemaMatchingExtractor, DpathSchemaExtractor
+ from source_google_sheets.components.partition_routers import RangePartitionRouter
+
+ __all__ = ["DpathSchemaMatchingExtractor", "RangePartitionRouter", "DpathSchemaExtractor"]
source_google_sheets/components/extractors.py
@@ -0,0 +1,207 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
+
+ import dpath
+ import requests
+
+ from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
+ from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
+ from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
+ from airbyte_cdk.sources.types import Config
+ from source_google_sheets.utils import name_conversion, safe_name_conversion
+
+
+ class RawSchemaParser:
+     config: Config
+
+     def _extract_data(
+         self,
+         body: Mapping[str, Any],
+         extraction_path: Optional[List[Union[InterpolatedString, str]]] = None,
+         default: Any = None,
+     ) -> Any:
+         """
+         Extracts data from the body based on the provided extraction path.
+         """
+         if not extraction_path:
+             return body
+
+         path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]
+
+         return dpath.get(body, path, default=default)  # type: ignore # extracted
+
+     def _set_data(
+         self, value: Any, body: MutableMapping[str, Any], extraction_path: Optional[List[Union[InterpolatedString, str]]] = None
+     ) -> Any:
+         """
+         Sets data in the body based on the provided extraction path.
+         """
+         if not extraction_path:
+             # Nothing to set without a path; return early instead of iterating over None below.
+             return
+
+         path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]
+
+         dpath.set(body, path, value=value)
+
+     def parse_raw_schema_values(
+         self,
+         raw_schema_data: MutableMapping[Any, Any],
+         schema_pointer: List[Union[InterpolatedString, str]],
+         key_pointer: List[Union[InterpolatedString, str]],
+         names_conversion: bool,
+     ):
+         """
+         1. Parses sheet headers from the provided raw schema. This method assumes the data is contiguous,
+            i.e. every cell contains a value, and the first cell without a value marks the end of the headers.
+         2. Applies name conversion if required.
+         3. Removes duplicated fields from the schema.
+         Returns a list of tuples of (property index as found in the array, parsed value, raw schema property).
+         """
+         raw_schema_properties = self._extract_data(raw_schema_data, schema_pointer, default=[])
+         duplicate_fields = set()
+         parsed_schema_values = []
+         seen_values = set()
+         for property_index, raw_schema_property in enumerate(raw_schema_properties):
+             raw_schema_property_value = self._extract_data(raw_schema_property, key_pointer)
+             if not raw_schema_property_value:
+                 break
+             if names_conversion:
+                 raw_schema_property_value = safe_name_conversion(raw_schema_property_value)
+
+             if raw_schema_property_value in seen_values:
+                 duplicate_fields.add(raw_schema_property_value)
+             seen_values.add(raw_schema_property_value)
+             parsed_schema_values.append((property_index, raw_schema_property_value, raw_schema_property))
+
+         if duplicate_fields:
+             parsed_schema_values = [
+                 parsed_schema_value for parsed_schema_value in parsed_schema_values if parsed_schema_value[1] not in duplicate_fields
+             ]
+
+         return parsed_schema_values
+
+     def parse(self, schema_type_identifier, records: Iterable[MutableMapping[Any, Any]]):
+         """Removes duplicated fields and applies name conversion."""
+         names_conversion = self.config.get("names_conversion", False)
+         schema_pointer = schema_type_identifier.get("schema_pointer")
+         key_pointer = schema_type_identifier["key_pointer"]
+         parsed_properties = []
+         for raw_schema_data in records:
+             for _, parsed_value, raw_schema_property in self.parse_raw_schema_values(
+                 raw_schema_data, schema_pointer, key_pointer, names_conversion
+             ):
+                 self._set_data(parsed_value, raw_schema_property, key_pointer)
+                 parsed_properties.append(raw_schema_property)
+             self._set_data(parsed_properties, raw_schema_data, schema_pointer)
+             yield raw_schema_data
+
+
+ @dataclass
+ class DpathSchemaMatchingExtractor(DpathExtractor, RawSchemaParser):
+     """
+     The stock DpathExtractor has problems with this kind of data in the response:
+
+     [
+         {
+             "values": [
+                 ["name1", "22"],
+                 ["name2", "24"],
+                 ["name3", "25"]
+             ]
+         }
+     ]
+
+     This is because the "values" field is a list of lists rather than a list of objects that we could
+     extract with "*". To handle it, we need the ordered properties from the schema so that each list of
+     values can be matched against them by column index. Then, given a properties object like
+     {0: 'name', 1: 'age'}, we end up with:
+
+     {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name1","age":"22"},"emitted_at":1734371904128}}
+     {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name2","age":"24"},"emitted_at":1734371904134}}
+     {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name3","age":"25"},"emitted_at":1734371904134}}
+     """
+
+     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+         super().__post_init__(parameters)
+         self.decoder = JsonDecoder(parameters={})
+         self._values_to_match_key = parameters["values_to_match_key"]
+         schema_type_identifier = parameters["schema_type_identifier"]
+         names_conversion = self.config.get("names_conversion", False)
+         self._indexed_properties_to_match = self.extract_properties_to_match(
+             parameters["properties_to_match"], schema_type_identifier, names_conversion=names_conversion
+         )
+
+     def extract_properties_to_match(self, properties_to_match, schema_type_identifier, names_conversion):
+         schema_pointer = schema_type_identifier.get("schema_pointer")
+         key_pointer = schema_type_identifier["key_pointer"]
+         indexed_properties = {}
+         for property_index, property_parsed_value, _ in self.parse_raw_schema_values(
+             properties_to_match, schema_pointer, key_pointer, names_conversion
+         ):
+             indexed_properties[property_index] = property_parsed_value
+         return indexed_properties
+
+     @staticmethod
+     def match_properties_with_values(unmatched_values: List[str], indexed_properties: Dict[int, str]):
+         data = {}
+         for relevant_index in sorted(indexed_properties.keys()):
+             if relevant_index >= len(unmatched_values):
+                 break
+
+             unmatched_value = unmatched_values[relevant_index]
+             if unmatched_value.strip() != "":
+                 data[indexed_properties[relevant_index]] = unmatched_value
+         yield data
+
+     @staticmethod
+     def is_row_empty(cell_values: List[str]) -> bool:
+         return all(cell.strip() == "" for cell in cell_values)
+
+     @staticmethod
+     def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
+         return any(len(cell_values) > idx and cell_values[idx].strip() != "" for idx in relevant_indices)
+
+     def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
+         raw_records_extracted = super().extract_records(response=response)
+         for raw_record in raw_records_extracted:
+             unmatched_values_collection = raw_record.get(self._values_to_match_key, [])
+             for unmatched_values in unmatched_values_collection:
+                 if not self.is_row_empty(unmatched_values) and self.row_contains_relevant_data(
+                     unmatched_values, self._indexed_properties_to_match.keys()
+                 ):
+                     yield from self.match_properties_with_values(unmatched_values, self._indexed_properties_to_match)
+
+
+ class DpathSchemaExtractor(DpathExtractor, RawSchemaParser):
+     """
+     Applies name conversion and parses sheet headers from the provided row.
+     """
+
+     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+         super().__post_init__(parameters)
+         self.schema_type_identifier = parameters["schema_type_identifier"]
+
+     def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
+         extracted_records = super().extract_records(response=response)
+         yield from self.parse(schema_type_identifier=self.schema_type_identifier, records=extracted_records)
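To make the matching logic above concrete, here is a small standalone sketch that reproduces match_properties_with_values outside the CDK, using the same sample payload as the class docstring. The headers mapping stands in for what parse_raw_schema_values would produce from the header row; everything here is illustrative rather than the connector's actual runtime wiring:

from typing import Dict, Iterable, List

def match_properties_with_values(unmatched_values: List[str], indexed_properties: Dict[int, str]) -> Iterable[dict]:
    # Pair each cell with the header sharing its column index, skipping blank cells.
    data = {}
    for index in sorted(indexed_properties):
        if index >= len(unmatched_values):
            break  # the row is shorter than the header row
        if unmatched_values[index].strip():
            data[indexed_properties[index]] = unmatched_values[index]
    yield data

headers = {0: "name", 1: "age"}  # as parsed from the first sheet row
for row in [["name1", "22"], ["name2", "24"], ["name3", "25"]]:
    print(next(match_properties_with_values(row, headers)))
# {'name': 'name1', 'age': '22'}
# {'name': 'name2', 'age': '24'}
# {'name': 'name3', 'age': '25'}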
source_google_sheets/components/partition_routers.py
@@ -0,0 +1,36 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ import logging
+ from typing import Any, Iterable, Mapping
+
+ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
+ from airbyte_cdk.sources.types import StreamSlice
+
+
+ logger = logging.getLogger("airbyte")
+
+
+ class RangePartitionRouter(SinglePartitionRouter):
+     """
+     Creates row ranges for requesting row data from the Google Sheets API.
+     """
+
+     parameters: Mapping[str, Any]
+
+     def __init__(self, parameters: Mapping[str, Any]) -> None:
+         super().__init__(parameters)
+         self.parameters = parameters
+         self.sheet_row_count = parameters.get("row_count", 0)
+         self.sheet_id = parameters.get("sheet_id")
+         self.batch_size = parameters.get("batch_size")
+
+     def stream_slices(self) -> Iterable[StreamSlice]:
+         start_range = 2  # skip the first row, which is expected to hold the column (field) names
+
+         while start_range <= self.sheet_row_count:
+             end_range = start_range + self.batch_size
+             logger.info(f"Fetching range {self.sheet_id}!{start_range}:{end_range}")
+             yield StreamSlice(partition={"start_range": start_range, "end_range": end_range}, cursor_slice={})
+             start_range = end_range + 1
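As a quick illustration of the slicing above: a sheet with 10 rows fetched in batches of 4 yields two ranges starting at row 2, since row 1 holds the headers. The parameter values below are made up for the example, and the snippet assumes the connector and a compatible airbyte-cdk are importable:

from source_google_sheets.components.partition_routers import RangePartitionRouter

router = RangePartitionRouter(parameters={"row_count": 10, "sheet_id": "Sheet1", "batch_size": 4})
for s in router.stream_slices():
    print(s.partition)
# {'start_range': 2, 'end_range': 6}
# {'start_range': 7, 'end_range': 11}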