airbyte-source-google-sheets 0.8.4__py3-none-any.whl → 0.9.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airbyte_source_google_sheets-0.8.4.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/METADATA +5 -9
- airbyte_source_google_sheets-0.9.0rc1.dist-info/RECORD +16 -0
- {airbyte_source_google_sheets-0.8.4.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/WHEEL +1 -1
- source_google_sheets/__init__.py +4 -0
- source_google_sheets/components/__init__.py +8 -0
- source_google_sheets/components/extractors.py +207 -0
- source_google_sheets/components/partition_routers.py +36 -0
- source_google_sheets/manifest.yaml +407 -0
- source_google_sheets/models/spreadsheet.py +1 -1
- source_google_sheets/models/spreadsheet_values.py +1 -1
- source_google_sheets/run.py +43 -5
- source_google_sheets/source.py +14 -269
- source_google_sheets/spec.yaml +4 -6
- source_google_sheets/utils.py +2 -1
- airbyte_source_google_sheets-0.8.4.dist-info/RECORD +0 -14
- source_google_sheets/client.py +0 -48
- source_google_sheets/helpers.py +0 -232
- {airbyte_source_google_sheets-0.8.4.dist-info → airbyte_source_google_sheets-0.9.0rc1.dist-info}/entry_points.txt +0 -0
@@ -1,23 +1,19 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.3
|
2
2
|
Name: airbyte-source-google-sheets
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9.0rc1
|
4
4
|
Summary: Source implementation for Google Sheets.
|
5
|
-
Home-page: https://airbyte.com
|
6
5
|
License: Elv2
|
7
6
|
Author: Airbyte
|
8
7
|
Author-email: contact@airbyte.io
|
9
|
-
Requires-Python: >=3.10,<
|
8
|
+
Requires-Python: >=3.10,<3.13
|
10
9
|
Classifier: License :: Other/Proprietary License
|
11
10
|
Classifier: Programming Language :: Python :: 3
|
12
11
|
Classifier: Programming Language :: Python :: 3.10
|
13
12
|
Classifier: Programming Language :: Python :: 3.11
|
14
13
|
Classifier: Programming Language :: Python :: 3.12
|
15
|
-
|
16
|
-
Requires-Dist: Unidecode (==1.3.8)
|
17
|
-
Requires-Dist: airbyte-cdk (>=4,<5)
|
18
|
-
Requires-Dist: google-api-python-client (==2.114.0)
|
19
|
-
Requires-Dist: google-auth-httplib2 (==0.2.0)
|
14
|
+
Requires-Dist: airbyte-cdk (>=6,<7)
|
20
15
|
Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-sheets
|
16
|
+
Project-URL: Homepage, https://airbyte.com
|
21
17
|
Project-URL: Repository, https://github.com/airbytehq/airbyte
|
22
18
|
Description-Content-Type: text/markdown
|
23
19
|
|
@@ -0,0 +1,16 @@
|
|
1
|
+
source_google_sheets/__init__.py,sha256=dYaZS0KrTjksk_yeSuXmNDXDsNKWctFnlZJSMgLryXE,135
|
2
|
+
source_google_sheets/components/__init__.py,sha256=v7odPaLdz2S_NRzExsxkk6694Vmjbplz4Z5mA4lxdkA,343
|
3
|
+
source_google_sheets/components/extractors.py,sha256=S7lPBuy9MO_mnl3h4B48F_szuusvDdvpX0OBK-_AxVA,8909
|
4
|
+
source_google_sheets/components/partition_routers.py,sha256=SWo1V0K10ZdWE2TQ0KuQUfue04RTyHJe1f6BOj6c96s,1265
|
5
|
+
source_google_sheets/manifest.yaml,sha256=qoCSQ4i7PnXCArmJ4RPOQA_31j8Vmd22ryI9ZL2cVDc,15756
|
6
|
+
source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
|
7
|
+
source_google_sheets/models/spreadsheet.py,sha256=DEef7bWQEpY1Uqyk7RN6qLF8oxLSNzuAtHlGRmMAKQA,1112
|
8
|
+
source_google_sheets/models/spreadsheet_values.py,sha256=-XRMuuILn9JN8svHNTj6-oG8mLTZOZ5Hejy2pJ5bILk,440
|
9
|
+
source_google_sheets/run.py,sha256=eaPRcarWqkB2b2DokvI83w7rz1blmWPQCFahvCyCdSY,1887
|
10
|
+
source_google_sheets/source.py,sha256=qO1KoGdphieu7F5VgDYtrbqs56AUvMWFGNvFHP2b9Z4,778
|
11
|
+
source_google_sheets/spec.yaml,sha256=RIUILMhfS0is2r_mCkmIVrQfvND1D3eobDK1YElmzhU,5009
|
12
|
+
source_google_sheets/utils.py,sha256=JEQIVLSFEAff-7zF3gPzsvFc9xLfCj9hVuFFYrSWiOo,2290
|
13
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/METADATA,sha256=V4s8KsVDIXElKB48e04etCJ7Le4i-KBOS0xQZZpK9Nw,5371
|
14
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
15
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
|
16
|
+
airbyte_source_google_sheets-0.9.0rc1.dist-info/RECORD,,
|
source_google_sheets/__init__.py
CHANGED
@@ -0,0 +1,8 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from source_google_sheets.components.extractors import DpathSchemaMatchingExtractor, DpathSchemaExtractor
|
6
|
+
from source_google_sheets.components.partition_routers import RangePartitionRouter
|
7
|
+
|
8
|
+
__all__ = ["DpathSchemaMatchingExtractor", "RangePartitionRouter", "DpathSchemaExtractor"]
|
@@ -0,0 +1,207 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
|
7
|
+
|
8
|
+
import dpath
|
9
|
+
import requests
|
10
|
+
|
11
|
+
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
|
12
|
+
from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
|
13
|
+
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
|
14
|
+
from airbyte_cdk.sources.types import Config
|
15
|
+
from source_google_sheets.utils import name_conversion, safe_name_conversion
|
16
|
+
|
17
|
+
|
18
|
+
class RawSchemaParser:
    """
    Mixin that cleans up raw sheet-header rows extracted from a Google Sheets API response.

    Responsibilities: evaluate dpath-style pointers against a record body, optionally
    convert header names, and drop duplicated header fields. Relies on the host class
    (a ``DpathExtractor`` dataclass) to supply ``self.config``.
    """

    config: Config

    def _extract_data(
        self,
        body: Mapping[str, Any],
        extraction_path: Optional[List[Union[InterpolatedString, str]]] = None,
        default: Any = None,
    ) -> Any:
        """
        Extracts data from the body based on the provided extraction path.

        Returns ``body`` untouched when no path is given, and ``default`` when the
        path does not exist in ``body``.
        """
        if not extraction_path:
            return body

        # InterpolatedString nodes are evaluated against the connector config;
        # plain strings are used as-is.
        path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]

        return dpath.get(body, path, default=default)  # type: ignore # extracted

    def _set_data(
        self, value: Any, body: MutableMapping[str, Any], extraction_path: Optional[List[Union[InterpolatedString, str]]] = None
    ) -> Any:
        """
        Sets data in the body based on the provided extraction path.

        With no ``extraction_path`` there is nothing to navigate into, so the call is a
        no-op for the caller; returning early also avoids iterating a ``None`` path
        below (previously a TypeError when ``parse`` passed a missing schema_pointer).
        """
        if not extraction_path:
            body = value
            return

        path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]

        dpath.set(body, path, value=value)

    def parse_raw_schema_values(
        self,
        raw_schema_data: MutableMapping[Any, Any],
        schema_pointer: List[Union[InterpolatedString, str]],
        key_pointer: List[Union[InterpolatedString, str]],
        names_conversion: bool,
    ):
        """
        1. Parses sheet headers from the provided raw schema. This method assumes that data is contiguous
            i.e: every cell contains a value and the first cell which does not contain a value denotes the end
            of the headers.
        2. Makes name conversion if required.
        3. Removes duplicated fields from the schema.
        Return a list of tuples with correct property index (by found in array), value and raw_schema
        """
        raw_schema_properties = self._extract_data(raw_schema_data, schema_pointer, default=[])
        duplicate_fields = set()
        parsed_schema_values = []
        seen_values = set()
        for property_index, raw_schema_property in enumerate(raw_schema_properties):
            raw_schema_property_value = self._extract_data(raw_schema_property, key_pointer)
            # A blank header cell marks the end of the contiguous header row.
            if not raw_schema_property_value:
                break
            if names_conversion:
                raw_schema_property_value = safe_name_conversion(raw_schema_property_value)

            if raw_schema_property_value in seen_values:
                duplicate_fields.add(raw_schema_property_value)
            seen_values.add(raw_schema_property_value)
            parsed_schema_values.append((property_index, raw_schema_property_value, raw_schema_property))

        if duplicate_fields:
            # Drop every occurrence of a duplicated header name, not just the extras.
            parsed_schema_values = [
                parsed_schema_value for parsed_schema_value in parsed_schema_values if parsed_schema_value[1] not in duplicate_fields
            ]

        return parsed_schema_values

    def parse(self, schema_type_identifier, records: Iterable[MutableMapping[Any, Any]]):
        """Removes duplicated fields and makes names conversion"""
        names_conversion = self.config.get("names_conversion", False)
        schema_pointer = schema_type_identifier.get("schema_pointer")
        key_pointer = schema_type_identifier["key_pointer"]
        # NOTE(review): parsed_properties is shared across all records; if more than one
        # schema record is ever yielded, earlier properties would leak into later
        # records — confirm single-record usage before changing.
        parsed_properties = []
        for raw_schema_data in records:
            for _, parsed_value, raw_schema_property in self.parse_raw_schema_values(
                raw_schema_data, schema_pointer, key_pointer, names_conversion
            ):
                # Write the (possibly converted, deduplicated) header value back in place.
                self._set_data(parsed_value, raw_schema_property, key_pointer)
                parsed_properties.append(raw_schema_property)
            self._set_data(parsed_properties, raw_schema_data, schema_pointer)
            yield raw_schema_data
103
|
+
|
104
|
+
|
105
|
+
@dataclass
class DpathSchemaMatchingExtractor(DpathExtractor, RawSchemaParser):
    """
    Extractor that pairs positional row values with schema-derived property names.

    The Google Sheets API returns each row as a bare list of cell strings rather
    than as an object, e.g. ``{"values": [["name1", "22"], ["name2", "24"]]}``,
    which the stock DpathExtractor cannot map with ``"*"``. This extractor builds
    an index -> property-name mapping from the parsed header row (e.g.
    ``{0: "name", 1: "age"}``) and zips each value list against it, producing
    records such as ``{"name": "name1", "age": "22"}``. Rows that are entirely
    blank, or blank in every mapped column, are skipped.
    """

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        super().__post_init__(parameters)
        self.decoder = JsonDecoder(parameters={})
        self._values_to_match_key = parameters["values_to_match_key"]
        schema_type_identifier = parameters["schema_type_identifier"]
        self._indexed_properties_to_match = self.extract_properties_to_match(
            parameters["properties_to_match"],
            schema_type_identifier,
            names_conversion=self.config.get("names_conversion", False),
        )

    def extract_properties_to_match(self, properties_to_match, schema_type_identifier, names_conversion):
        """Build the {column_index: property_name} mapping from the raw header schema."""
        parsed = self.parse_raw_schema_values(
            properties_to_match,
            schema_type_identifier.get("schema_pointer"),
            schema_type_identifier["key_pointer"],
            names_conversion,
        )
        return {index: value for index, value, _ in parsed}

    @staticmethod
    def match_properties_with_values(unmatched_values: List[str], indexed_properties: Dict[int, str]):
        """Yield one record mapping each known column index to its cell value; blank cells are omitted."""
        record = {}
        for index in sorted(indexed_properties):
            # Short rows simply lack the trailing columns.
            if index >= len(unmatched_values):
                break
            cell = unmatched_values[index]
            if cell.strip():
                record[indexed_properties[index]] = cell
        yield record

    @staticmethod
    def is_row_empty(cell_values: List[str]) -> bool:
        """True when every cell in the row is blank or whitespace."""
        return all(not cell.strip() for cell in cell_values)

    @staticmethod
    def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
        """True when at least one mapped column holds a non-blank value."""
        return any(len(cell_values) > index and cell_values[index].strip() for index in relevant_indices)

    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
        for raw_record in super().extract_records(response=response):
            for row_values in raw_record.get(self._values_to_match_key, []):
                if self.is_row_empty(row_values):
                    continue
                if self.row_contains_relevant_data(row_values, self._indexed_properties_to_match.keys()):
                    yield from self.match_properties_with_values(row_values, self._indexed_properties_to_match)
|
194
|
+
|
195
|
+
|
196
|
+
class DpathSchemaExtractor(DpathExtractor, RawSchemaParser):
    """
    Extractor that post-processes dpath-extracted records as sheet headers:
    applies name conversion and removes duplicated header fields.
    """

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        super().__post_init__(parameters)
        self.schema_type_identifier = parameters["schema_type_identifier"]

    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
        records = super().extract_records(response=response)
        yield from self.parse(schema_type_identifier=self.schema_type_identifier, records=records)
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from typing import Any, Iterable, Mapping
|
7
|
+
|
8
|
+
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
|
9
|
+
from airbyte_cdk.sources.types import StreamSlice
|
10
|
+
|
11
|
+
|
12
|
+
logger = logging.getLogger("airbyte")
|
13
|
+
|
14
|
+
|
15
|
+
class RangePartitionRouter(SinglePartitionRouter):
    """
    Partition router that slices a sheet into consecutive row ranges so record
    data can be requested from the Google Sheets API in batches.
    """

    parameters: Mapping[str, Any]

    def __init__(self, parameters: Mapping[str, Any]) -> None:
        super().__init__(parameters)
        self.parameters = parameters
        self.sheet_row_count = parameters.get("row_count", 0)
        self.sheet_id = parameters.get("sheet_id")
        self.batch_size = parameters.get("batch_size")

    def stream_slices(self) -> Iterable[StreamSlice]:
        # Row 1 holds the column (field) names, so data starts at row 2.
        start_range = 2

        while start_range <= self.sheet_row_count:
            end_range = start_range + self.batch_size
            logger.info(f"Fetching range {self.sheet_id}!{start_range}:{end_range}")
            yield StreamSlice(partition={"start_range": start_range, "end_range": end_range}, cursor_slice={})
            # Ranges are inclusive, so the next slice begins one row past the last.
            start_range = end_range + 1
|