arcane-ingestion 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arcane_ingestion-0.1.0/PKG-INFO +24 -0
- arcane_ingestion-0.1.0/README.md +5 -0
- arcane_ingestion-0.1.0/arcane/ingestion/__init__.py +3 -0
- arcane_ingestion-0.1.0/arcane/ingestion/datastore_lgp.py +29 -0
- arcane_ingestion-0.1.0/arcane/ingestion/exception.py +3 -0
- arcane_ingestion-0.1.0/arcane/ingestion/execution_handler.py +90 -0
- arcane_ingestion-0.1.0/pyproject.toml +25 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arcane-ingestion
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Utils function for ingestion entity
|
|
5
|
+
Author: Arcane
|
|
6
|
+
Author-email: product@wearcane.com
|
|
7
|
+
Requires-Python: >=3.11,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
|
+
Requires-Dist: arcane-datastore (>=1.1.15,<2.0.0)
|
|
14
|
+
Requires-Dist: arcane-pubsub (>=1.5.0,<2.0.0)
|
|
15
|
+
Requires-Dist: python-dateutil (>=2.7,<3.0)
|
|
16
|
+
Requires-Dist: pytz (>=2024.2,<2025.0)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# Arcane ingestion README
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Release history
|
|
23
|
+
For the list of changes, see CHANGELOG.md
|
|
24
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import pytz
|
|
4
|
+
|
|
5
|
+
from google.cloud.datastore import Entity
|
|
6
|
+
|
|
7
|
+
from arcane.datastore import DATA_INGESTION_KIND, Client as DatastoreClient
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def set_ingestion_execution_info_as_start(
    datastore_client: DatastoreClient,
    ingestion: Entity,
    current_time: Optional[datetime] = None
):
    """Flag an ingestion as running and save its refreshed execution info.

    The ingestion's existing ``execution_info`` (if any) is copied, then its
    ``status``, ``timestamp`` and ``errors`` keys are overwritten with
    ``"running"``, ``current_time`` and an empty list. The result is passed
    through ``datastore_client.convert_input_to_excluded_entity`` and saved
    with ``datastore_client.save_entity_with_transactions`` under
    ``DATA_INGESTION_KIND``.

    Args:
        datastore_client: Client used to convert and persist the update.
        ingestion: Ingestion entity; must contain an ``id`` key.
        current_time: Start time to record. When omitted, the current UTC
            time truncated to the second is used.

    Raises:
        KeyError: If ``ingestion`` has no ``id`` key.
    """
    ingestion_id = ingestion['id']
    if not current_time:
        # Build an aware UTC timestamp directly instead of converting a naive
        # local time afterwards.
        current_time = datetime.now(pytz.utc).replace(microsecond=0)

    # ``execution_info`` can be absent or explicitly stored as None (callers
    # check for ``is not None``); unpacking None would raise TypeError, so
    # both cases fall back to an empty mapping.
    previous_execution_info = ingestion.get('execution_info') or {}
    execution_info = {
        **previous_execution_info,
        "status": "running",
        "timestamp": current_time,
        "errors": [],
    }
    updated_properties = dict(
        execution_info=datastore_client.convert_input_to_excluded_entity(
            execution_info)
    )
    datastore_client.save_entity_with_transactions(
        ingestion_id, updated_properties, DATA_INGESTION_KIND)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from datetime import datetime, timedelta, timezone
|
|
2
|
+
import logging
|
|
3
|
+
from dateutil.parser import isoparse
|
|
4
|
+
import pytz
|
|
5
|
+
|
|
6
|
+
from google.cloud.datastore import Entity
|
|
7
|
+
|
|
8
|
+
from arcane.datastore import Client as DatastoreClient
|
|
9
|
+
from arcane.pubsub import Client as PubSubClient
|
|
10
|
+
|
|
11
|
+
from .datastore_lgp import set_ingestion_execution_info_as_start
|
|
12
|
+
from .exception import SkipExecutionError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _get_delta_time_for_ingestion_type(data_ingestion_type: str) -> timedelta:
    """Return how long a 'running' ingestion blocks a new execution.

    The window depends on the ingestion type so that expensive or long-running
    ingestions are not retried too eagerly:

    * ``'BIG_QUERY'`` and ``'HTTP'``: 90 minutes. BigQuery ingestion is
      expensive, and an HTTP ingestion can run for 30 minutes and be retried
      3 times.
    * ``'SHOPIFY_API'``: 60 minutes. It can run for several minutes and be
      retried 5 times (5 bulk executions per token).
    * any other type: 30 minutes.
    """
    if data_ingestion_type in ('BIG_QUERY', 'HTTP'):
        return timedelta(minutes=90)
    if data_ingestion_type == 'SHOPIFY_API':
        return timedelta(minutes=60)
    # Default window for every other ingestion type.
    return timedelta(minutes=30)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def handle_start_execution(
    body: dict,
    entity: Entity,
    pubsub_client: PubSubClient,
    datastore_client: DatastoreClient,
    project_id: str,
    pf_monitoring_topic: str,
    ingestion_type: str,
    monitoring_step: str
):
    """
    Handle the start of an execution for an entity.

    Unless the request is flagged as a manual execution, this function
    validates the schedule version and checks whether the entity is already
    running. It then marks the entity as running in the datastore and, for
    every service other than 'datalab', publishes a 'start' monitoring event.

    Args:
        body (dict): The request body. Must contain monitoring_id, entity_id
            and schedule_version; may contain manual_execution (defaults to
            False), which bypasses the schedule version and running checks.
        entity (Entity): The entity object containing parameters, service and
            execution information.
        pubsub_client (PubSubClient): Client for publishing messages to Pub/Sub topics.
        datastore_client (DatastoreClient): Client for interacting with the datastore.
        project_id (str): The GCP project ID.
        pf_monitoring_topic (str): The Pub/Sub topic name for product flow monitoring.
        ingestion_type (str): The type of ingestion (e.g., 'BIG_QUERY', 'HTTP',
            'SHOPIFY_API') used to determine how long a running execution
            blocks a new one.
        monitoring_step (str): The step name to include in the monitoring event.

    Raises:
        SkipExecutionError: If the schedule version is outdated or if a previous
            execution is still running (started less than the allowed delta
            time for the ingestion type ago).
    """
    monitoring_id = body["monitoring_id"]
    entity_id = int(body["entity_id"])

    schedule_version = body['schedule_version']
    manual_execution = body.get('manual_execution', False)
    if manual_execution:
        logging.info(f'Manual execution for entity with id {entity_id}. Running without checking schedule version and execution status.')
    else:
        current_schedule_version = entity['parameters']['schedule_version']
        if schedule_version != current_schedule_version:
            logging.info(f'Entity with id {entity_id} has outdated schedule info. Previous version {schedule_version} and current version {current_schedule_version}')
            raise SkipExecutionError(f'Schedule info is outdated. Previous version {schedule_version} and current version {current_schedule_version}')

        execution_info = entity.get('execution_info')
        if execution_info is not None and 'status' in execution_info:
            if execution_info['status'] == 'running':
                timestamp = execution_info.get('timestamp')
                if timestamp and not isinstance(timestamp, datetime):
                    # Non-datetime timestamps are parsed as ISO 8601 strings.
                    timestamp = isoparse(timestamp).astimezone(pytz.utc)
                elif timestamp and timestamp.tzinfo is None:
                    # Subtracting a naive datetime from an aware one raises
                    # TypeError, so a naive value is interpreted as UTC.
                    # NOTE(review): assumes naive timestamps are stored in UTC - confirm.
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
                # A 'running' status only blocks a new execution while it is
                # younger than the delta allowed for this ingestion type.
                if timestamp and (datetime.now(timezone.utc) - timestamp).total_seconds() < _get_delta_time_for_ingestion_type(ingestion_type).total_seconds():
                    logging.info(f'Entity with id {entity_id} is already running since {timestamp}. Skipping execution.')
                    raise SkipExecutionError('Previous execution is still running')

    set_ingestion_execution_info_as_start(datastore_client, entity)
    # No monitoring event is published for the 'datalab' service.
    if entity['service'] != 'datalab':
        pubsub_client.pubsub_publish_pf_monitoring(
            monitoring_id=monitoring_id,
            entity_id=str(entity_id),
            step=monitoring_step,
            status='start',
            project_id=project_id,
            topic=pf_monitoring_topic
        )
|
|
90
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "arcane-ingestion"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Utils function for ingestion entity"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = ["Arcane <product@wearcane.com>"]
|
|
7
|
+
packages = [
|
|
8
|
+
{ include = "arcane" }
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[tool.poetry.dependencies]
|
|
12
|
+
python = "^3.11"
|
|
13
|
+
arcane-datastore = "^1.1.15"
|
|
14
|
+
arcane-pubsub = "^1.5.0"
|
|
15
|
+
python-dateutil = "^2.7"
|
|
16
|
+
pytz = "^2024.2"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
[tool.poetry.group.dev.dependencies]
|
|
20
|
+
pytest = "^7.0"
|
|
21
|
+
pytest-mock = "^3.10"
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["poetry-core>=1.0.0"]
|
|
25
|
+
build-backend = "poetry.core.masonry.api"
|