airbyte-source-google-sheets 0.8.5__py3-none-any.whl → 0.9.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0.dist-info}/METADATA +3 -7
- airbyte_source_google_sheets-0.9.0.dist-info/RECORD +16 -0
- source_google_sheets/__init__.py +4 -0
- source_google_sheets/components/__init__.py +8 -0
- source_google_sheets/components/extractors.py +207 -0
- source_google_sheets/components/partition_routers.py +36 -0
- source_google_sheets/manifest.yaml +408 -0
- source_google_sheets/models/spreadsheet.py +1 -1
- source_google_sheets/models/spreadsheet_values.py +1 -1
- source_google_sheets/run.py +43 -5
- source_google_sheets/source.py +14 -271
- source_google_sheets/spec.yaml +4 -6
- source_google_sheets/utils.py +1 -1
- airbyte_source_google_sheets-0.8.5.dist-info/RECORD +0 -14
- source_google_sheets/client.py +0 -49
- source_google_sheets/helpers.py +0 -234
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0.dist-info}/WHEEL +0 -0
- {airbyte_source_google_sheets-0.8.5.dist-info → airbyte_source_google_sheets-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -1,21 +1,17 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: airbyte-source-google-sheets
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9.0
|
4
4
|
Summary: Source implementation for Google Sheets.
|
5
5
|
License: Elv2
|
6
6
|
Author: Airbyte
|
7
7
|
Author-email: contact@airbyte.io
|
8
|
-
Requires-Python: >=3.10,<
|
8
|
+
Requires-Python: >=3.10,<3.13
|
9
9
|
Classifier: License :: Other/Proprietary License
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
11
11
|
Classifier: Programming Language :: Python :: 3.10
|
12
12
|
Classifier: Programming Language :: Python :: 3.11
|
13
13
|
Classifier: Programming Language :: Python :: 3.12
|
14
|
-
|
15
|
-
Requires-Dist: Unidecode (==1.3.8)
|
16
|
-
Requires-Dist: airbyte-cdk (>=4,<5)
|
17
|
-
Requires-Dist: google-api-python-client (==2.114.0)
|
18
|
-
Requires-Dist: google-auth-httplib2 (==0.2.0)
|
14
|
+
Requires-Dist: airbyte-cdk (>=6,<7)
|
19
15
|
Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-sheets
|
20
16
|
Project-URL: Homepage, https://airbyte.com
|
21
17
|
Project-URL: Repository, https://github.com/airbytehq/airbyte
|
@@ -0,0 +1,16 @@
|
|
1
|
+
source_google_sheets/__init__.py,sha256=dYaZS0KrTjksk_yeSuXmNDXDsNKWctFnlZJSMgLryXE,135
|
2
|
+
source_google_sheets/components/__init__.py,sha256=v7odPaLdz2S_NRzExsxkk6694Vmjbplz4Z5mA4lxdkA,343
|
3
|
+
source_google_sheets/components/extractors.py,sha256=S7lPBuy9MO_mnl3h4B48F_szuusvDdvpX0OBK-_AxVA,8909
|
4
|
+
source_google_sheets/components/partition_routers.py,sha256=SWo1V0K10ZdWE2TQ0KuQUfue04RTyHJe1f6BOj6c96s,1265
|
5
|
+
source_google_sheets/manifest.yaml,sha256=HG8vpxv4lqwe5TCneq_zxuERaGNdtno7yAs61Wsk30E,15821
|
6
|
+
source_google_sheets/models/__init__.py,sha256=Z-4MTpxG5t2jGhXzs4PPoIOa83zw3jRnUDx0N9Puv3s,61
|
7
|
+
source_google_sheets/models/spreadsheet.py,sha256=DEef7bWQEpY1Uqyk7RN6qLF8oxLSNzuAtHlGRmMAKQA,1112
|
8
|
+
source_google_sheets/models/spreadsheet_values.py,sha256=-XRMuuILn9JN8svHNTj6-oG8mLTZOZ5Hejy2pJ5bILk,440
|
9
|
+
source_google_sheets/run.py,sha256=eaPRcarWqkB2b2DokvI83w7rz1blmWPQCFahvCyCdSY,1887
|
10
|
+
source_google_sheets/source.py,sha256=qO1KoGdphieu7F5VgDYtrbqs56AUvMWFGNvFHP2b9Z4,778
|
11
|
+
source_google_sheets/spec.yaml,sha256=RIUILMhfS0is2r_mCkmIVrQfvND1D3eobDK1YElmzhU,5009
|
12
|
+
source_google_sheets/utils.py,sha256=JEQIVLSFEAff-7zF3gPzsvFc9xLfCj9hVuFFYrSWiOo,2290
|
13
|
+
airbyte_source_google_sheets-0.9.0.dist-info/METADATA,sha256=TDctCjzHtQ3IV9mhNhq0_mqjI0TTMY4ZQAdhu33pIyM,5368
|
14
|
+
airbyte_source_google_sheets-0.9.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
15
|
+
airbyte_source_google_sheets-0.9.0.dist-info/entry_points.txt,sha256=Dtsfjohe5IPUFyqojk49SIoP7CifCTlNLG_pgivzppo,69
|
16
|
+
airbyte_source_google_sheets-0.9.0.dist-info/RECORD,,
|
source_google_sheets/__init__.py
CHANGED
@@ -0,0 +1,8 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from source_google_sheets.components.extractors import DpathSchemaMatchingExtractor, DpathSchemaExtractor
|
6
|
+
from source_google_sheets.components.partition_routers import RangePartitionRouter
|
7
|
+
|
8
|
+
__all__ = ["DpathSchemaMatchingExtractor", "RangePartitionRouter", "DpathSchemaExtractor"]
|
@@ -0,0 +1,207 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
|
7
|
+
|
8
|
+
import dpath
|
9
|
+
import requests
|
10
|
+
|
11
|
+
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
|
12
|
+
from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
|
13
|
+
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
|
14
|
+
from airbyte_cdk.sources.types import Config
|
15
|
+
from source_google_sheets.utils import name_conversion, safe_name_conversion
|
16
|
+
|
17
|
+
|
18
|
+
class RawSchemaParser:
    """
    Mixin that post-processes a raw sheet-header schema extracted from a
    Google Sheets API response: optionally converts header names and removes
    duplicated header fields.
    """

    # Connector configuration; read for the "names_conversion" flag.
    config: Config

    def _extract_data(
        self,
        body: Mapping[str, Any],
        extraction_path: Optional[List[Union[InterpolatedString, str]]] = None,
        default: Any = None,
    ) -> Any:
        """
        Extracts data from the body based on the provided extraction path.
        Returns ``body`` unchanged when no path is given.
        """
        if not extraction_path:
            return body

        # InterpolatedString nodes are evaluated against the config before lookup.
        path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]

        return dpath.get(body, path, default=default)  # type: ignore # extracted

    def _set_data(
        self, value: Any, body: MutableMapping[str, Any], extraction_path: Optional[List[Union[InterpolatedString, str]]] = None
    ) -> Any:
        """
        Sets data in the body based on the provided extraction path.
        """
        if not extraction_path:
            # BUGFIX: the original rebound the local ``body`` (a no-op for the
            # caller) and then fell through, iterating a possibly-None path and
            # calling dpath.set with an empty path. With no path there is
            # nothing to mutate in place, so hand the value back instead.
            return value

        path = [node.eval(self.config) if not isinstance(node, str) else node for node in extraction_path]

        dpath.set(body, path, value=value)

    def parse_raw_schema_values(
        self,
        raw_schema_data: MutableMapping[Any, Any],
        schema_pointer: List[Union[InterpolatedString, str]],
        key_pointer: List[Union[InterpolatedString, str]],
        names_conversion: bool,
    ):
        """
        1. Parses sheet headers from the provided raw schema. This method assumes that data is contiguous
        i.e: every cell contains a value and the first cell which does not contain a value denotes the end
        of the headers.
        2. Makes name conversion if required.
        3. Removes duplicated fields from the schema.
        Return a list of tuples with correct property index (by found in array), value and raw_schema
        """
        raw_schema_properties = self._extract_data(raw_schema_data, schema_pointer, default=[])
        duplicate_fields = set()
        parsed_schema_values = []
        seen_values = set()
        for property_index, raw_schema_property in enumerate(raw_schema_properties):
            raw_schema_property_value = self._extract_data(raw_schema_property, key_pointer)
            # First empty header cell marks the end of the header row.
            if not raw_schema_property_value:
                break
            if names_conversion:
                raw_schema_property_value = safe_name_conversion(raw_schema_property_value)

            if raw_schema_property_value in seen_values:
                duplicate_fields.add(raw_schema_property_value)
            seen_values.add(raw_schema_property_value)
            parsed_schema_values.append((property_index, raw_schema_property_value, raw_schema_property))

        if duplicate_fields:
            # Every occurrence of a duplicated header (including the first) is dropped.
            parsed_schema_values = [
                parsed_schema_value for parsed_schema_value in parsed_schema_values if parsed_schema_value[1] not in duplicate_fields
            ]

        return parsed_schema_values

    def parse(self, schema_type_identifier, records: Iterable[MutableMapping[Any, Any]]):
        """Removes duplicated fields and makes names conversion"""
        names_conversion = self.config.get("names_conversion", False)
        schema_pointer = schema_type_identifier.get("schema_pointer")
        key_pointer = schema_type_identifier["key_pointer"]
        parsed_properties = []
        for raw_schema_data in records:
            for _, parsed_value, raw_schema_property in self.parse_raw_schema_values(
                raw_schema_data, schema_pointer, key_pointer, names_conversion
            ):
                # Write the (possibly converted) name back into the raw property.
                self._set_data(parsed_value, raw_schema_property, key_pointer)
                parsed_properties.append(raw_schema_property)
            self._set_data(parsed_properties, raw_schema_data, schema_pointer)
            yield raw_schema_data
104
|
+
|
105
|
+
@dataclass
class DpathSchemaMatchingExtractor(DpathExtractor, RawSchemaParser):
    """
    Extractor for Google Sheets responses in which the records are lists of
    cell values rather than objects, e.g.:

    [
        {"values": [["name1", "22"], ["name2", "24"], ["name3", "25"]]}
    ]

    A plain DpathExtractor cannot address those inner lists with "*", so this
    extractor pairs each cell list positionally with the ordered header
    properties parsed from the schema. Given properties {0: "name", 1: "age"}
    the rows above become:

    {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name1","age":"22"},"emitted_at":1734371904128}}
    {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name2","age":"24"},"emitted_at":1734371904134}}
    {"type":"RECORD","record":{"stream":"a_stream_name","data":{"name":"name3","age":"25"},"emitted_at":1734371904134}}
    """

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        super().__post_init__(parameters)
        self.decoder = JsonDecoder(parameters={})
        self._values_to_match_key = parameters["values_to_match_key"]
        identifier = parameters["schema_type_identifier"]
        self._indexed_properties_to_match = self.extract_properties_to_match(
            parameters["properties_to_match"],
            identifier,
            names_conversion=self.config.get("names_conversion", False),
        )

    def extract_properties_to_match(self, properties_to_match, schema_type_identifier, names_conversion):
        """Build {column_index: property_name} from the raw header row."""
        parsed_values = self.parse_raw_schema_values(
            properties_to_match,
            schema_type_identifier.get("schema_pointer"),
            schema_type_identifier["key_pointer"],
            names_conversion,
        )
        return {column_index: parsed_value for column_index, parsed_value, _ in parsed_values}

    @staticmethod
    def match_properties_with_values(unmatched_values: List[str], indexed_properties: Dict[int, str]):
        """Yield one record dict mapping property names to same-index cell values."""
        record = {}
        for column_index in sorted(indexed_properties):
            # Cell lists may be shorter than the header row; stop at the end.
            if column_index >= len(unmatched_values):
                break
            cell = unmatched_values[column_index]
            if cell.strip():
                record[indexed_properties[column_index]] = cell
        yield record

    @staticmethod
    def is_row_empty(cell_values: List[str]) -> bool:
        """True when every cell is blank or whitespace-only."""
        return not any(cell.strip() for cell in cell_values)

    @staticmethod
    def row_contains_relevant_data(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
        """True when at least one schema-mapped column holds a non-blank value."""
        return any(len(cell_values) > idx and cell_values[idx].strip() for idx in relevant_indices)

    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
        """Extract raw records, then match each row of cell values to the indexed properties."""
        indexed_properties = self._indexed_properties_to_match
        for raw_record in super().extract_records(response=response):
            for row_values in raw_record.get(self._values_to_match_key, []):
                if DpathSchemaMatchingExtractor.is_row_empty(row_values):
                    continue
                if not DpathSchemaMatchingExtractor.row_contains_relevant_data(row_values, indexed_properties.keys()):
                    continue
                yield from DpathSchemaMatchingExtractor.match_properties_with_values(row_values, indexed_properties)
|
195
|
+
|
196
|
+
class DpathSchemaExtractor(DpathExtractor, RawSchemaParser):
    """
    Extractor that applies header parsing and optional name conversion
    (via RawSchemaParser.parse) to every record the base extractor yields.
    """

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        super().__post_init__(parameters)
        # Identifies where the schema and header keys live within a record.
        self.schema_type_identifier = parameters["schema_type_identifier"]

    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
        raw_records = super().extract_records(response=response)
        yield from self.parse(schema_type_identifier=self.schema_type_identifier, records=raw_records)
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from typing import Any, Iterable, Mapping
|
7
|
+
|
8
|
+
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
|
9
|
+
from airbyte_cdk.sources.types import StreamSlice
|
10
|
+
|
11
|
+
|
12
|
+
logger = logging.getLogger("airbyte")
|
13
|
+
|
14
|
+
|
15
|
+
class RangePartitionRouter(SinglePartitionRouter):
    """
    Create ranges to request rows data to google sheets api.
    """

    parameters: Mapping[str, Any]

    # Fallback request size when the manifest omits "batch_size".
    # NOTE(review): 200 rows mirrors the connector's historical batch size —
    # confirm against manifest.yaml before relying on it.
    DEFAULT_BATCH_SIZE = 200

    def __init__(self, parameters: Mapping[str, Any]) -> None:
        super().__init__(parameters)
        self.parameters = parameters
        self.sheet_row_count = parameters.get("row_count", 0)
        self.sheet_id = parameters.get("sheet_id")
        # BUGFIX: previously ``parameters.get("batch_size")`` with no default,
        # which made stream_slices raise TypeError (int + None) whenever the
        # parameter was omitted; "row_count" already had a default.
        batch_size = parameters.get("batch_size")
        self.batch_size = self.DEFAULT_BATCH_SIZE if batch_size is None else batch_size

    def stream_slices(self) -> Iterable[StreamSlice]:
        """Yield inclusive [start_range, end_range] row windows covering the sheet."""
        start_range = 2  # skip 1 row, as expected column (fields) names there

        while start_range <= self.sheet_row_count:
            end_range = start_range + self.batch_size
            logger.info(f"Fetching range {self.sheet_id}!{start_range}:{end_range}")
            yield StreamSlice(partition={"start_range": start_range, "end_range": end_range}, cursor_slice={})
            start_range = end_range + 1