arcane-ingestion 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.4
2
+ Name: arcane-ingestion
3
+ Version: 0.1.0
4
+ Summary: Utils function for ingestion entity
5
+ Author: Arcane
6
+ Author-email: product@wearcane.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Programming Language :: Python :: 3.14
13
+ Requires-Dist: arcane-datastore (>=1.1.15,<2.0.0)
14
+ Requires-Dist: arcane-pubsub (>=1.5.0,<2.0.0)
15
+ Requires-Dist: python-dateutil (>=2.7,<3.0)
16
+ Requires-Dist: pytz (>=2024.2,<2025.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # Arcane ingestion README
20
+
21
+
22
+ ## Release history
23
+ To see changes, please see CHANGELOG.md
24
+
@@ -0,0 +1,5 @@
1
+ # Arcane ingestion README
2
+
3
+
4
+ ## Release history
5
+ To see changes, please see CHANGELOG.md
@@ -0,0 +1,3 @@
1
+ from .datastore_lgp import *
2
+ from .execution_handler import *
3
+ from .exception import *
@@ -0,0 +1,29 @@
1
+ from datetime import datetime
2
+ from typing import Optional
3
+ import pytz
4
+
5
+ from google.cloud.datastore import Entity
6
+
7
+ from arcane.datastore import DATA_INGESTION_KIND, Client as DatastoreClient
8
+
9
+
10
def set_ingestion_execution_info_as_start(
    datastore_client: DatastoreClient,
    ingestion: Entity,
    current_time: Optional[datetime] = None
) -> None:
    """Mark an ingestion entity as running and persist the new execution info.

    The existing ``execution_info`` of the ingestion is kept, except that
    ``status`` is set to ``"running"``, ``timestamp`` is set to
    ``current_time`` and ``errors`` is reset to an empty list.

    Args:
        datastore_client: Client used to convert and save the execution info.
        ingestion: Ingestion entity; its ``'id'`` key identifies the entity
            to update.
        current_time: Timestamp stored as the start of the execution.
            Defaults to the current UTC time truncated to the second.
    """
    ingestion_id = ingestion['id']
    if current_time is None:
        # Build the timestamp directly in UTC instead of converting a naive
        # local time, which is ambiguous around DST transitions.
        current_time = datetime.now(pytz.utc).replace(microsecond=0)

    # The key may exist with a None value; unpacking None would raise a
    # TypeError, so fall back to an empty mapping in that case as well.
    previous_execution_info = ingestion.get('execution_info') or {}
    execution_info = {
        **previous_execution_info,
        "status": "running",
        "timestamp": current_time,
        "errors": [],
    }
    updated_properties = {
        'execution_info': datastore_client.convert_input_to_excluded_entity(
            execution_info),
    }
    datastore_client.save_entity_with_transactions(
        ingestion_id, updated_properties, DATA_INGESTION_KIND)
@@ -0,0 +1,3 @@
1
class SkipExecutionError(Exception):
    """Signal that an execution must be skipped.

    Used when the schedule information of the request is outdated or when
    another execution of the same entity is still in progress.
    """
@@ -0,0 +1,90 @@
1
+ from datetime import datetime, timedelta, timezone
2
+ import logging
3
+ from dateutil.parser import isoparse
4
+ import pytz
5
+
6
+ from google.cloud.datastore import Entity
7
+
8
+ from arcane.datastore import Client as DatastoreClient
9
+ from arcane.pubsub import Client as PubSubClient
10
+
11
+ from .datastore_lgp import set_ingestion_execution_info_as_start
12
+ from .exception import SkipExecutionError
13
+
14
+
15
def _get_delta_time_for_ingestion_type(data_ingestion_type: str) -> timedelta:
    """Return how long a 'running' execution blocks a new one for this type.

    The window is longer for expensive or long-running ingestions so that
    they are not retried too eagerly. Unknown types get 30 minutes.
    """
    # BigQuery ingestion is expensive, so over-retrying must be prevented.
    # HTTP ingestion can run for 30 minutes and can be retried 3 times.
    if data_ingestion_type in ('BIG_QUERY', 'HTTP'):
        return timedelta(minutes=90)

    # SHOPIFY_API ingestion can run for several minutes and can be retried
    # 5 times (5 bulk executions per token).
    if data_ingestion_type == 'SHOPIFY_API':
        return timedelta(minutes=60)

    return timedelta(minutes=30)
26
+
27
+
28
+
29
def handle_start_execution(
    body: dict,
    entity: Entity,
    pubsub_client: PubSubClient,
    datastore_client: DatastoreClient,
    project_id: str,
    pf_monitoring_topic: str,
    ingestion_type: str,
    monitoring_step: str
):
    """
    Handle the start of an execution for an entity.

    Unless the request is a manual execution, this function validates the
    schedule version and checks whether the entity is already running. It then
    marks the entity as running and, for every service except 'datalab',
    publishes a 'start' monitoring event.

    Args:
        body (dict): The request body containing monitoring_id, entity_id and
            schedule_version, and optionally manual_execution.
        entity (Entity): The entity object containing parameters, service and
            execution information.
        pubsub_client (PubSubClient): Client for publishing messages to Pub/Sub topics.
        datastore_client (DatastoreClient): Client for interacting with the datastore.
        project_id (str): The GCP project ID.
        pf_monitoring_topic (str): The Pub/Sub topic name for product flow monitoring.
        ingestion_type (str): The type of ingestion (e.g., 'BIG_QUERY', 'HTTP',
            'SHOPIFY_API'), used to determine how long a running execution
            blocks a new one.
        monitoring_step (str): The step name to include in the monitoring event.

    Raises:
        SkipExecutionError: If the schedule version is outdated or if a previous
            execution is still running (started less than the allowed delta
            time ago for the ingestion type).
    """
    monitoring_id = body["monitoring_id"]
    entity_id = int(body["entity_id"])
    schedule_version = body['schedule_version']

    if body.get('manual_execution', False):
        logging.info(f'Manual execution for entity with id {entity_id}. Running without checking schedule version and execution status.')
    else:
        current_schedule_version = entity['parameters']['schedule_version']
        if schedule_version != current_schedule_version:
            logging.info(f'Entity with id {entity_id} has outdated schedule info. Previous version {schedule_version} and current version {current_schedule_version}')
            raise SkipExecutionError(f'Schedule info is outdated. Previous version {schedule_version} and current version {current_schedule_version}')

        # execution_info may be missing or None; both mean "never started".
        execution_info = entity.get('execution_info') or {}
        if execution_info.get('status') == 'running':
            timestamp = execution_info.get('timestamp')
            if timestamp and not isinstance(timestamp, datetime):
                # Timestamps that are not datetime objects are parsed as ISO 8601 strings.
                timestamp = isoparse(timestamp).astimezone(pytz.utc)
            if timestamp:
                # A 'running' status older than the allowed window is treated
                # as stale and does not block a new execution.
                max_running_time = _get_delta_time_for_ingestion_type(ingestion_type)
                if datetime.now(timezone.utc) - timestamp < max_running_time:
                    logging.info(f'Entity with id {entity_id} is already running since {timestamp}. Skipping execution.')
                    raise SkipExecutionError('Previous execution is still running')

    set_ingestion_execution_info_as_start(datastore_client, entity)
    if entity['service'] != 'datalab':
        pubsub_client.pubsub_publish_pf_monitoring(
            monitoring_id=monitoring_id,
            entity_id=str(entity_id),
            step=monitoring_step,
            status='start',
            project_id=project_id,
            topic=pf_monitoring_topic
        )
90
+
@@ -0,0 +1,25 @@
1
+ [tool.poetry]
2
+ name = "arcane-ingestion"
3
+ version = "0.1.0"
4
+ description = "Utils function for ingestion entity"
5
+ readme = "README.md"
6
+ authors = ["Arcane <product@wearcane.com>"]
7
+ packages = [
8
+ { include = "arcane" }
9
+ ]
10
+
11
+ [tool.poetry.dependencies]
12
+ python = "^3.11"
13
+ arcane-datastore = "^1.1.15"
14
+ arcane-pubsub = "^1.5.0"
15
+ python-dateutil = "^2.7"
16
+ pytz = "^2024.2"
17
+
18
+
19
+ [tool.poetry.group.dev.dependencies]
20
+ pytest = "^7.0"
21
+ pytest-mock = "^3.10"
22
+
23
+ [build-system]
24
+ requires = ["poetry-core>=1.0.0"]
25
+ build-backend = "poetry.core.masonry.api"