airbyte-source-shopify 2.4.14.dev202407181247__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries.
- {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/METADATA +4 -4
- {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/RECORD +25 -27
- {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/WHEEL +1 -1
- source_shopify/auth.py +0 -1
- source_shopify/config_migrations.py +4 -1
- source_shopify/http_request.py +4 -2
- source_shopify/schemas/countries.json +7 -19
- source_shopify/schemas/customer_journey_summary.json +228 -148
- source_shopify/schemas/deleted_products.json +27 -0
- source_shopify/schemas/orders.json +38 -0
- source_shopify/schemas/product_variants.json +26 -8
- source_shopify/schemas/profile_location_groups.json +10 -0
- source_shopify/scopes.py +7 -6
- source_shopify/shopify_graphql/bulk/exceptions.py +6 -1
- source_shopify/shopify_graphql/bulk/job.py +173 -65
- source_shopify/shopify_graphql/bulk/query.py +440 -88
- source_shopify/shopify_graphql/bulk/record.py +260 -29
- source_shopify/shopify_graphql/bulk/retry.py +12 -12
- source_shopify/shopify_graphql/bulk/tools.py +17 -2
- source_shopify/source.py +6 -10
- source_shopify/spec.json +11 -5
- source_shopify/streams/base_streams.py +181 -54
- source_shopify/streams/streams.py +211 -58
- source_shopify/utils.py +47 -12
- source_shopify/schemas/customer_saved_search.json +0 -32
- source_shopify/schemas/products_graph_ql.json +0 -123
- source_shopify/shopify_graphql/graphql.py +0 -64
- source_shopify/shopify_graphql/schema.py +0 -29442
- {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/entry_points.txt +0 -0
```diff
--- a/source_shopify/shopify_graphql/bulk/job.py
+++ b/source_shopify/shopify_graphql/bulk/job.py
@@ -2,27 +2,38 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-import logging
 from dataclasses import dataclass, field
 from datetime import datetime
+from enum import Enum
 from time import sleep, time
 from typing import Any, Final, Iterable, List, Mapping, Optional
 
 import pendulum as pdm
 import requests
-from airbyte_cdk.sources.streams.http import HttpClient
 from requests.exceptions import JSONDecodeError
-from source_shopify.utils import ApiTypeEnum
+from source_shopify.utils import LOGGER, ApiTypeEnum
 from source_shopify.utils import ShopifyRateLimiter as limiter
 
-from
+from airbyte_cdk.sources.streams.http import HttpClient
+
 from .exceptions import AirbyteTracedException, ShopifyBulkExceptions
 from .query import ShopifyBulkQuery, ShopifyBulkTemplates
+from .record import ShopifyBulkRecord
 from .retry import bulk_retry_on_exception
 from .status import ShopifyBulkJobStatus
 from .tools import END_OF_FILE, BulkTools
 
 
+class BulkOperationUserErrorCode(Enum):
+    """
+    Possible error codes that can be returned by BulkOperationUserError.
+    https://shopify.dev/docs/api/admin-graphql/latest/enums/BulkOperationUserErrorCode
+    """
+
+    INVALID = "INVALID"
+    OPERATION_IN_PROGRESS = "OPERATION_IN_PROGRESS"
+
+
 @dataclass
 class ShopifyBulkManager:
     http_client: HttpClient
```
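The notable addition in this hunk is the `BulkOperationUserErrorCode` enum, which the connector later uses to match API error codes structurally instead of substring-matching error messages (see the `_has_running_concurrent_job` hunk further down). A minimal, self-contained sketch of that comparison pattern; the `error` payload below is illustrative, shaped after the docstring example later in this diff:

```python
from enum import Enum


class BulkOperationUserErrorCode(Enum):
    """Subset of Shopify's BulkOperationUserErrorCode values used by the connector."""

    INVALID = "INVALID"
    OPERATION_IN_PROGRESS = "OPERATION_IN_PROGRESS"


# A userError payload as returned by `bulkOperationRunQuery`; the message text here
# is illustrative, taken from the docstring example further down in this diff.
error = {
    "code": "OPERATION_IN_PROGRESS",
    "field": None,
    "message": "A bulk query operation for this app and shop is already in progress: ...",
}

# Comparing against `.value` keeps the raw API string out of the call sites.
assert error["code"] == BulkOperationUserErrorCode.OPERATION_IN_PROGRESS.value
```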
```diff
@@ -32,8 +43,8 @@ class ShopifyBulkManager:
     job_size: float
     job_checkpoint_interval: int
 
-
-
+    parent_stream_name: Optional[str] = None
+    parent_stream_cursor: Optional[str] = None
 
     # 10Mb chunk size to save the file
     _retrieve_chunk_size: Final[int] = 1024 * 1024 * 10
@@ -54,7 +65,7 @@
 
     # currents: _job_id, _job_state, _job_created_at, _job_self_canceled
     _job_id: Optional[str] = field(init=False, default=None)
-    _job_state: str = field(init=False, default=None)  # this string is based on ShopifyBulkJobStatus
+    _job_state: str | None = field(init=False, default=None)  # this string is based on ShopifyBulkJobStatus
     # completed and saved Bulk Job result filename
     _job_result_filename: Optional[str] = field(init=False, default=None)
     # date-time when the Bulk Job was created on the server
@@ -71,8 +82,8 @@
     _job_last_rec_count: int = field(init=False, default=0)
     # the flag to adjust the next slice from the checkpointed cursor vaue
    _job_adjust_slice_from_checkpoint: bool = field(init=False, default=False)
-    #
-
+    # keeps the last checkpointed cursor value for supported streams
+    _job_last_checkpoint_cursor_value: str | None = field(init=False, default=None)
 
     # expand slice factor
     _job_size_expand_factor: int = field(init=False, default=2)
@@ -84,7 +95,7 @@
     # 2 sec is set as default value to cover the case with the empty-fast-completed jobs
     _job_last_elapsed_time: float = field(init=False, default=2.0)
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         self._job_size = self.job_size
         # The upper boundary for slice size is limited by the value from the config, default value is `P30D`
         self._job_size_max = self.job_size
@@ -95,6 +106,8 @@
         self._job_max_elapsed_time = self.job_termination_threshold
         # how many records should be collected before we use the checkpoining
         self._job_checkpoint_interval = self.job_checkpoint_interval
+        # define Record Producer instance
+        self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord(self.query, self.parent_stream_name, self.parent_stream_cursor)
 
     @property
     def _tools(self) -> BulkTools:
```
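`__post_init__` now wires up a `ShopifyBulkRecord` producer from the query plus the new optional parent-stream metadata. A minimal sketch of the same `dataclass` pattern, with a stand-in `RecordProducer` class in place of the real `ShopifyBulkRecord` (which lives in `source_shopify/shopify_graphql/bulk/record.py`):

```python
from dataclasses import dataclass, field
from typing import Optional


class RecordProducer:
    """Stand-in for ShopifyBulkRecord; the real class parses JSONL bulk results."""

    def __init__(self, query: str, parent_stream_name: Optional[str], parent_stream_cursor: Optional[str]) -> None:
        self.query = query
        self.parent_stream_name = parent_stream_name
        self.parent_stream_cursor = parent_stream_cursor


@dataclass
class Manager:
    query: str
    parent_stream_name: Optional[str] = None
    parent_stream_cursor: Optional[str] = None
    # init=False fields are not constructor arguments; they are assigned in __post_init__
    record_producer: RecordProducer = field(init=False)

    def __post_init__(self) -> None:
        self.record_producer = RecordProducer(self.query, self.parent_stream_name, self.parent_stream_cursor)


m = Manager(query="{ orders { edges { node { id } } } }", parent_stream_name="orders")
assert m.record_producer.parent_stream_name == "orders"
```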
```diff
@@ -151,9 +164,20 @@
             self._job_should_revert_slice = False
             return False
 
+    @property
+    def _supports_checkpointing(self) -> bool:
+        """
+        The flag to determine whether or not the BULK Stream supports the `BULK checkpointing`.
+        """
+        return self.query.supports_checkpointing
+
     @property
     def _job_should_checkpoint(self) -> bool:
-        return self._job_last_rec_count >= self._job_checkpoint_interval
+        return self._supports_checkpointing and self._job_last_rec_count >= self._job_checkpoint_interval
+
+    @property
+    def _job_any_lines_collected(self) -> bool:
+        return self._job_last_rec_count > 0
 
     def _expand_job_size(self) -> None:
         self._job_size += self._job_size_adjusted_expand_factor
```
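The two new properties gate checkpointing twice: the stream's query must opt in (`supports_checkpointing`) and enough rows must have been collected. A runnable sketch of that gating, assuming plain attributes in place of the real dataclass fields:

```python
class CheckpointGate:
    """Illustrative stand-in for the checkpoint gating on ShopifyBulkManager."""

    def __init__(self, supports_checkpointing: bool, checkpoint_interval: int) -> None:
        self._supports_checkpointing = supports_checkpointing
        self._job_checkpoint_interval = checkpoint_interval
        self._job_last_rec_count = 0

    @property
    def _job_should_checkpoint(self) -> bool:
        # both conditions must hold: the query opts in AND enough rows were collected
        return self._supports_checkpointing and self._job_last_rec_count >= self._job_checkpoint_interval

    @property
    def _job_any_lines_collected(self) -> bool:
        return self._job_last_rec_count > 0


gate = CheckpointGate(supports_checkpointing=True, checkpoint_interval=100_000)
gate._job_last_rec_count = 25_000
assert gate._job_any_lines_collected and not gate._job_should_checkpoint
```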
```diff
@@ -191,18 +215,49 @@
         # set the running job object count to default
         self._job_last_rec_count = 0
 
+    def _set_checkpointing(self) -> None:
+        # set the flag to adjust the next slice from the checkpointed cursor value
+        self._job_adjust_slice_from_checkpoint = True
+
+    def _reset_checkpointing(self) -> None:
+        # reseting the checkpoint flag, if bulk job has completed normally
+        self._job_adjust_slice_from_checkpoint = False
+
+    def _set_last_checkpoint_cursor_value(self, checkpointed_cursor: str) -> None:
+        """
+        Sets the last checkpoint cursor value.
+
+        Args:
+            checkpointed_cursor (str): The cursor value to set as the last checkpoint. Defaults to None.
+        """
+        self._job_last_checkpoint_cursor_value = checkpointed_cursor
+
+    def _checkpoint_cursor_has_collision(self, checkpointed_cursor: str) -> bool:
+        """
+        Checks if the provided checkpointed cursor collides with the last checkpointed cursor value.
+
+        Args:
+            checkpointed_cursor (str): The cursor value to check for collision. Defaults to None.
+
+        Returns:
+            bool: True if the provided cursor collides with the last checkpointed cursor value, False otherwise.
+        """
+        return self._job_last_checkpoint_cursor_value == checkpointed_cursor
+
     def _job_completed(self) -> bool:
         return self._job_state == ShopifyBulkJobStatus.COMPLETED.value
 
     def _job_canceled(self) -> bool:
         return self._job_state == ShopifyBulkJobStatus.CANCELED.value
 
+    def _job_failed(self) -> bool:
+        return self._job_state == ShopifyBulkJobStatus.FAILED.value
+
     def _job_cancel(self) -> None:
         _, canceled_response = self.http_client.send_request(
             http_method="POST",
             url=self.base_url,
-
-            headers={"Content-Type": "application/graphql"},
+            json={"query": ShopifyBulkTemplates.cancel(self._job_id)},
             request_kwargs={},
         )
         # mark the job was self-canceled
```
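`_checkpoint_cursor_has_collision` exists to detect a stuck sync: if a checkpointed job yields the same cursor value twice in a row, the next slice would cover the same window again and the job would loop forever. A stripped-down sketch of the guard:

```python
class CursorTracker:
    """Sketch of the collision guard: re-seeing the same checkpointed cursor means
    the slice could not advance past the previous checkpoint."""

    def __init__(self) -> None:
        self._last: str | None = None

    def has_collision(self, checkpointed_cursor: str) -> bool:
        return self._last == checkpointed_cursor

    def set_last(self, checkpointed_cursor: str) -> None:
        self._last = checkpointed_cursor


tracker = CursorTracker()
assert not tracker.has_collision("2024-01-01T00:00:00Z")
tracker.set_last("2024-01-01T00:00:00Z")
# the same cursor showing up again signals a stuck checkpoint
assert tracker.has_collision("2024-01-01T00:00:00Z")
```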
```diff
@@ -221,21 +276,24 @@
         else:
             message = f"Elapsed time: {self._job_elapsed_time_in_state} sec"
             if self._job_last_rec_count > 0:
-                count_message = f".
+                count_message = f". Rows collected: {self._job_last_rec_count}"
                 message = message + count_message
             self._log_state(message)
             self._log_job_msg_count = 0
 
     def _log_state(self, message: Optional[str] = None) -> None:
-        pattern = f"Stream: `{self.http_client.
+        pattern = f"Stream: `{self.http_client.name}`, the BULK Job: `{self._job_id}` is {self._job_state}"
         if message:
-
+            LOGGER.info(f"{pattern}. {message}.")
         else:
-
+            LOGGER.info(pattern)
 
     def _job_get_result(self, response: Optional[requests.Response] = None) -> Optional[str]:
         parsed_response = response.json().get("data", {}).get("node", {}) if response else None
-
+        # get `complete` or `partial` result from collected Bulk Job results
+        full_result_url = parsed_response.get("url") if parsed_response else None
+        partial_result_url = parsed_response.get("partialDataUrl") if parsed_response else None
+        job_result_url = full_result_url if full_result_url else partial_result_url
         if job_result_url:
             # save to local file using chunks to avoid OOM
             filename = self._tools.filename_from_url(job_result_url)
```
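`_job_get_result` now falls back from `url` to `partialDataUrl`, which is what makes checkpointing and failed-job salvage possible. The selection logic, lifted into a standalone function for illustration (the URLs below are made up):

```python
def pick_result_url(parsed_response: dict | None) -> str | None:
    """Mirror of the url / partialDataUrl fallback above: a COMPLETED job exposes
    `url`, while a self-canceled (checkpointed) or failed job may expose only
    `partialDataUrl`."""
    full_result_url = parsed_response.get("url") if parsed_response else None
    partial_result_url = parsed_response.get("partialDataUrl") if parsed_response else None
    return full_result_url if full_result_url else partial_result_url


assert pick_result_url({"url": "https://storage.example/full.jsonl"}) == "https://storage.example/full.jsonl"
assert pick_result_url({"partialDataUrl": "https://storage.example/partial.jsonl"}) == "https://storage.example/partial.jsonl"
assert pick_result_url(None) is None
```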
```diff
@@ -248,6 +306,13 @@
                     file.write(END_OF_FILE.encode())
             return filename
 
+    def _job_get_checkpointed_result(self, response: Optional[requests.Response]) -> None:
+        if self._job_any_lines_collected or self._job_should_checkpoint:
+            # set the flag to adjust the next slice from the checkpointed cursor value
+            self._set_checkpointing()
+            # fetch the collected records from CANCELED Job on checkpointing
+            self._job_result_filename = self._job_get_result(response)
+
     def _job_update_state(self, response: Optional[requests.Response] = None) -> None:
         if response:
             self._job_state = response.json().get("data", {}).get("node", {}).get("status")
@@ -257,9 +322,7 @@
             self._log_job_state_with_count()
         elif self._job_state in [ShopifyBulkJobStatus.CANCELED.value, ShopifyBulkJobStatus.CANCELING.value]:
             # do not emit `CANCELED / CANCELING` Bulk Job status, while checkpointing
-            if self._job_should_checkpoint:
-                pass
-            else:
+            if not self._job_should_checkpoint:
                 self._log_job_state_with_count()
         else:
             self._log_state()
@@ -273,26 +336,20 @@
                     f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}"
                 )
             else:
-
-            # set the flag to adjust the next slice from the checkpointed cursor value
-            self._job_adjust_slice_from_checkpoint = True
-            # fetch the collected records from CANCELED Job on checkpointing
-            self._job_result_filename = self._job_get_result(response)
+                self._job_get_checkpointed_result(response)
 
     def _on_canceling_job(self, **kwargs) -> None:
         sleep(self._job_check_interval)
 
     def _cancel_on_long_running_job(self) -> None:
-
-            f"Stream: `{self.http_client.
+        LOGGER.info(
+            f"Stream: `{self.http_client.name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation."
         )
-        self._job_long_running_cancelation = True
         self._job_cancel()
 
     def _cancel_on_checkpointing(self) -> None:
-
+        LOGGER.info(f"Stream: `{self.http_client.name}`, checkpointing after >= `{self._job_checkpoint_interval}` rows collected.")
         # set the flag to adjust the next slice from the checkpointed cursor value
-        self._job_adjust_slice_from_checkpoint = True
         self._job_cancel()
 
     def _on_running_job(self, **kwargs) -> None:
@@ -306,10 +363,15 @@
     def _on_completed_job(self, response: Optional[requests.Response] = None) -> None:
         self._job_result_filename = self._job_get_result(response)
 
-    def _on_failed_job(self, response: requests.Response) -> AirbyteTracedException:
-
-
-
+    def _on_failed_job(self, response: requests.Response) -> AirbyteTracedException | None:
+        if not self._supports_checkpointing:
+            raise ShopifyBulkExceptions.BulkJobFailed(
+                f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}",
+            )
+        else:
+            # when the Bulk Job fails, usually there is a `partialDataUrl` available,
+            # we leverage the checkpointing in this case.
+            self._job_get_checkpointed_result(response)
 
     def _on_timeout_job(self, **kwargs) -> AirbyteTracedException:
         raise ShopifyBulkExceptions.BulkJobTimout(
```
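`_on_failed_job` no longer raises unconditionally: for checkpoint-capable streams it salvages whatever Shopify exposed via `partialDataUrl`. A behavioral sketch under stated assumptions; `BulkJobFailed` and the `partial_data_available` flag here are stand-ins, not the connector's real API:

```python
class BulkJobFailed(Exception):
    """Stand-in for ShopifyBulkExceptions.BulkJobFailed."""


def on_failed_job(supports_checkpointing: bool, partial_data_available: bool) -> str:
    if not supports_checkpointing:
        raise BulkJobFailed("job FAILED and the stream cannot checkpoint")
    # Shopify usually provides `partialDataUrl` for failed bulk jobs, so the
    # already-collected rows are downloaded and the next slice resumes from the
    # checkpointed cursor.
    return "checkpointed" if partial_data_available else "nothing to salvage"


assert on_failed_job(True, True) == "checkpointed"
```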
```diff
@@ -325,17 +387,17 @@
         raise ShopifyBulkExceptions.BulkJobError(f"Could not validate the status of the BULK Job `{self._job_id}`. Errors: {errors}.")
 
     def _on_non_handable_job_error(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedException:
-        raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.http_client.
+        raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.http_client.name}`, Non-handable error occured: {errors}")
 
-    def _get_server_errors(self, response: requests.Response) -> List[Optional[
+    def _get_server_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]:
         server_errors = response.json().get("errors", [])
         return [server_errors] if isinstance(server_errors, str) else server_errors
 
-    def _get_user_errors(self, response: requests.Response) -> List[Optional[
+    def _get_user_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]:
         user_errors = response.json().get("data", {}).get("bulkOperationRunQuery", {}).get("userErrors", [])
         return [user_errors] if isinstance(user_errors, str) else user_errors
 
-    def _collect_bulk_errors(self, response: requests.Response) -> List[Optional[
+    def _collect_bulk_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]:
         try:
             return self._get_server_errors(response) + self._get_user_errors(response)
         except (Exception, JSONDecodeError) as e:
@@ -353,8 +415,7 @@
         _, response = self.http_client.send_request(
             http_method="POST",
             url=self.base_url,
-
-            headers={"Content-Type": "application/graphql"},
+            json={"query": ShopifyBulkTemplates.status(self._job_id)},
             request_kwargs={},
         )
         self._job_healthcheck(response)
@@ -367,30 +428,23 @@
         Error example:
         [
             {
+                'code': 'OPERATION_IN_PROGRESS',
                 'field': None,
                 'message': 'A bulk query operation for this app and shop is already in progress: gid://shopify/BulkOperation/4039184154813.',
             }
         ]
         """
-
-        concurrent_job_pattern = "A bulk query operation for this app and shop is already in progress"
         # the errors are handled in `job_job_check_for_errors`
         if errors:
             for error in errors:
-
-                if
+                error_code = error.get("code", "") if isinstance(error, dict) else ""
+                if error_code == BulkOperationUserErrorCode.OPERATION_IN_PROGRESS.value:
                     return True
         return False
 
     def _has_reached_max_concurrency(self) -> bool:
         return self._concurrent_attempt == self._concurrent_max_retry
 
-    def _switch_base_url(self) -> None:
-        if self._new_base_url:
-            self.base_url = self._new_base_url
-        else:
-            self.logger.warning(f"Failed switching the `base url`, no `new base url` has been retrieved.")
-
     def _should_switch_shop_name(self, response: requests.Response) -> bool:
         """
         Sometimes the API returns the redirected response that points to the same Store but with different Name:
```
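The rewritten `_has_running_concurrent_job` drops the old `concurrent_job_pattern` substring match in favor of comparing the structured `code` field. The new loop body, extracted into a standalone function for illustration:

```python
from enum import Enum


class BulkOperationUserErrorCode(Enum):
    OPERATION_IN_PROGRESS = "OPERATION_IN_PROGRESS"


def has_running_concurrent_job(errors: list) -> bool:
    # matches on the structured `code` field instead of the human-readable message,
    # and tolerates plain-string errors (the server-error shape) via isinstance
    for error in errors or []:
        error_code = error.get("code", "") if isinstance(error, dict) else ""
        if error_code == BulkOperationUserErrorCode.OPERATION_IN_PROGRESS.value:
            return True
    return False


assert has_running_concurrent_job([{"code": "OPERATION_IN_PROGRESS", "field": None, "message": "..."}])
assert not has_running_concurrent_job(["some plain string error"])
```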
```diff
@@ -402,24 +456,26 @@
 
         This redirection is related to:
         1) `aliased` or `hidden` store names from being exposed
-        2) migrated
+        2) `migrated` store data to the `new store`, but referenced within the old one stil
 
         reference issue: https://github.com/airbytehq/oncall/issues/5866
         """
         if self.base_url != response.url:
-            self.
+            self.base_url = response.url
             return True
         return False
 
-    @bulk_retry_on_exception(
+    @bulk_retry_on_exception()
     def _job_check_state(self) -> None:
         while not self._job_completed():
             if self._job_canceled():
                 break
+            elif self._job_failed():
+                break
             else:
                 self._job_track_running()
 
-    @bulk_retry_on_exception(
+    @bulk_retry_on_exception()
     def create_job(self, stream_slice: Mapping[str, str], filter_field: str) -> None:
         if stream_slice:
             query = self.query.get(filter_field, stream_slice["start"], stream_slice["end"])
```
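`@bulk_retry_on_exception(` becoming `@bulk_retry_on_exception()` implies the decorator is now a factory invoked with default arguments rather than applied with a positional argument (the old truncated call appears to have passed something in). A generic sketch of the factory pattern, purely illustrative; the parameter names and retry policy below are not the connector's actual signature:

```python
import functools
import time


def bulk_retry_on_exception(max_retries: int = 3, backoff_sec: float = 1.0):
    # factory: returns the real decorator, so it must be applied with parentheses
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == max_retries - 1:
                        raise
                    time.sleep(backoff_sec * (attempt + 1))
        return wrapper
    return decorator


@bulk_retry_on_exception()
def flaky() -> str:
    return "ok"


assert flaky() == "ok"
```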
```diff
@@ -437,7 +493,7 @@
         if self._has_running_concurrent_job(errors):
             # when the concurrent job takes place, another job could not be created
             # we typically need to wait and retry, but no longer than 10 min. (see retry in `bulk_retry_on_exception`)
-            raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.http_client.
+            raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.http_client.name}")
         elif self._should_switch_shop_name(response):
             # assign new shop name, since the one that specified in `config` was redirected to the different one.
             raise ShopifyBulkExceptions.BulkJobRedirectToOtherShopError(f"Switching the `store` name, redirected to: {response.url}")
@@ -459,9 +515,9 @@
         self._job_id = bulk_response.get("id")
         self._job_created_at = bulk_response.get("createdAt")
         self._job_state = ShopifyBulkJobStatus.CREATED.value
-
+        LOGGER.info(f"Stream: `{self.http_client.name}`, the BULK Job: `{self._job_id}` is {ShopifyBulkJobStatus.CREATED.value}")
 
-    def job_size_normalize(self, start: datetime, end: datetime) ->
+    def job_size_normalize(self, start: datetime, end: datetime) -> None:
         # adjust slice size when it's bigger than the loop point when it should end,
         # to preserve correct job size adjustments when this is the only job we need to run, based on STATE provided
         requested_slice_size = (end - start).total_days()
@@ -471,9 +527,45 @@
         step = self._job_size if self._job_size else self._job_size_min
         return slice_start.add(days=step)
 
-    def
+    def _adjust_slice_end(
+        self, slice_end: datetime, checkpointed_cursor: Optional[str] = None, filter_checkpointed_cursor: Optional[str] = None
+    ) -> datetime:
+        """
+        Choose between the existing `slice_end` value or `checkpointed_cursor` value or `filter_checkpointed_cursor` value, if provided.
+
+        Optionally: raises the `transient` error if the checkpoint collision occurs.
+
+        Note: filter_checkpointed_cursor is only used when cursor field is ID for streams like Customer Address etc.
+        This method should return a datetime from last checkpointed value to adjust slice end, when cursor value is ID (int type)
+        method gets end datetime from filter_checkpointed_cursor, which is value from filter field from last record.
+        See https://github.com/airbytehq/oncall/issues/9052 for more details.
+        """
+
+        if checkpointed_cursor:
+            if self._checkpoint_cursor_has_collision(checkpointed_cursor):
+                raise ShopifyBulkExceptions.BulkJobCheckpointCollisionError(
+                    f"The stream: `{self.http_client.name}` checkpoint collision is detected. Try to increase the `BULK Job checkpoint (rows collected)` to the bigger value. The stream will be synced again during the next sync attempt."
+                )
+            # set the checkpointed cursor value
+            self._set_last_checkpoint_cursor_value(checkpointed_cursor)
+            if isinstance(checkpointed_cursor, str):
+                return pdm.parse(checkpointed_cursor)
+            if isinstance(checkpointed_cursor, int):
+                return pdm.parse(filter_checkpointed_cursor)
+
+        return slice_end
+
+    def get_adjusted_job_end(
+        self,
+        slice_start: datetime,
+        slice_end: datetime,
+        checkpointed_cursor: Optional[str] = None,
+        filter_checkpointed_cursor: Optional[str] = None,
+    ) -> datetime:
         if self._job_adjust_slice_from_checkpoint:
-
+            # set the checkpointing to default, before the next slice is emitted, to avoid inf.loop
+            self._reset_checkpointing()
+            return self._adjust_slice_end(slice_end, checkpointed_cursor, filter_checkpointed_cursor)
 
         if self._is_long_running_job:
             self._job_size_reduce_next()
```
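`get_adjusted_job_end` is where checkpointing feeds back into slicing: after a checkpointed job, the next window ends at the checkpointed cursor instead of the originally planned end. A reduced sketch of `_adjust_slice_end` for the common string-cursor case, using the module's own `pendulum` alias:

```python
import pendulum as pdm


def adjust_slice_end(slice_end, checkpointed_cursor=None):
    # str cursors are datetimes (e.g. `updated_at`); the real method also handles
    # int (ID) cursors by parsing a separate `filter_checkpointed_cursor` value
    if isinstance(checkpointed_cursor, str):
        return pdm.parse(checkpointed_cursor)
    return slice_end


planned_end = pdm.parse("2024-07-31T00:00:00Z")
checkpointed = adjust_slice_end(planned_end, "2024-07-10T12:30:00Z")
assert checkpointed < planned_end  # the window shrinks to where the job stopped
```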
```diff
@@ -481,8 +573,25 @@
 
         return slice_end
 
+    def _emit_final_job_message(self, job_current_elapsed_time: int) -> None:
+        final_message = f"Stream: `{self.http_client.name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec."
+
+        if self._job_any_lines_collected:
+            lines_collected_message = f" Rows collected: {self._job_last_rec_count} --> records: `{self.record_producer.record_composed}`."
+            final_message = final_message + lines_collected_message
+
+        # emit final Bulk job status message
+        LOGGER.info(f"{final_message}")
+
+    def _process_bulk_results(self) -> Iterable[Mapping[str, Any]]:
+        if self._job_result_filename:
+            # produce records from saved bulk job result
+            yield from self.record_producer.read_file(self._job_result_filename)
+        else:
+            yield from []
+
     @limiter.balance_rate_limit(api_type=ApiTypeEnum.graphql.value)
-    def
+    def job_get_results(self) -> Optional[Iterable[Mapping[str, Any]]]:
         """
         This method checks the status for the `CREATED` Shopify BULK Job, using it's `ID`.
         The time spent for the Job execution is tracked to understand the effort.
@@ -492,7 +601,7 @@
         try:
             # track created job until it's COMPLETED
             self._job_check_state()
-
+            yield from self._process_bulk_results()
         except (
             ShopifyBulkExceptions.BulkJobFailed,
             ShopifyBulkExceptions.BulkJobTimout,
@@ -504,9 +613,8 @@
             raise bulk_job_error
         finally:
             job_current_elapsed_time = round((time() - job_started), 3)
-
-
-            )
+            # emit the final Bulk Job log message
+            self._emit_final_job_message(job_current_elapsed_time)
             # check whether or not we should expand or reduce the size of the slice
             self.__adjust_job_size(job_current_elapsed_time)
             # reset the state for COMPLETED job
```
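With `yield from self._process_bulk_results()` inside the `try`, `job_get_results` is now a generator: records stream out lazily, and the `finally` block still runs its logging and slice-size adjustment once the consumer exhausts it. A minimal sketch of the producer side, with a stand-in for `record_producer.read_file`:

```python
from typing import Any, Iterable, Mapping


def process_bulk_results(filename: str | None) -> Iterable[Mapping[str, Any]]:
    if filename:
        # stand-in for `self.record_producer.read_file(filename)`, which parses
        # the saved JSONL bulk result file record by record
        yield {"id": 1}
        yield {"id": 2}
    else:
        yield from []


assert list(process_bulk_results(None)) == []
assert [r["id"] for r in process_bulk_results("result.jsonl")] == [1, 2]
```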