scalable-pypeline 1.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypeline/__init__.py +1 -0
- pypeline/celery.py +270 -0
- pypeline/celery_beat.py +254 -0
- pypeline/cli/__init__.py +0 -0
- pypeline/cli/config_server.py +48 -0
- pypeline/cli/core.py +32 -0
- pypeline/cli/deploy.py +138 -0
- pypeline/cloud.py +80 -0
- pypeline/constants.py +139 -0
- pypeline/deploy.py +167 -0
- pypeline/extensions.py +16 -0
- pypeline/flask/__init__.py +28 -0
- pypeline/flask/api/__init__.py +0 -0
- pypeline/flask/api/pipelines.py +245 -0
- pypeline/flask/api/schedules.py +67 -0
- pypeline/flask/api/utils.py +36 -0
- pypeline/flask/decorators.py +92 -0
- pypeline/flask/flask_sermos.py +219 -0
- pypeline/generators.py +196 -0
- pypeline/lib/__init__.py +0 -0
- pypeline/lib/config_server.py +159 -0
- pypeline/logging_config.py +171 -0
- pypeline/pipeline_config_schema.py +197 -0
- pypeline/schedule_config_schema.py +210 -0
- pypeline/sermos_yaml.py +737 -0
- pypeline/utils/__init__.py +0 -0
- pypeline/utils/config_utils.py +327 -0
- pypeline/utils/graph_utils.py +144 -0
- pypeline/utils/module_utils.py +119 -0
- pypeline/utils/task_utils.py +803 -0
- scalable_pypeline-1.1.0.dist-info/LICENSE +177 -0
- scalable_pypeline-1.1.0.dist-info/METADATA +166 -0
- scalable_pypeline-1.1.0.dist-info/RECORD +38 -0
- scalable_pypeline-1.1.0.dist-info/WHEEL +6 -0
- scalable_pypeline-1.1.0.dist-info/entry_points.txt +2 -0
- scalable_pypeline-1.1.0.dist-info/top_level.txt +2 -0
- tests/fixtures/__init__.py +1 -0
- tests/fixtures/s3_fixtures.py +52 -0
pypeline/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '1.1.0'
|
pypeline/celery.py
ADDED
@@ -0,0 +1,270 @@
|
|
1
|
+
""" Configure and instantiate Celery
|
2
|
+
"""
|
3
|
+
import os
|
4
|
+
|
5
|
+
from pypeline.constants import DEFAULT_RETRY_TASK_MAX_TTL, DEFAULT_MAX_RETRY
|
6
|
+
|
7
|
+
if os.environ.get('USE_GEVENT', "False").lower() == 'true':
|
8
|
+
from gevent import monkey
|
9
|
+
monkey.patch_all()
|
10
|
+
|
11
|
+
import random
|
12
|
+
import time
|
13
|
+
|
14
|
+
from celery_dyrygent.tasks import register_workflow_processor
|
15
|
+
|
16
|
+
import sys
|
17
|
+
import logging
|
18
|
+
from typing import List
|
19
|
+
from celery import Celery
|
20
|
+
from pypeline.logging_config import setup_logging
|
21
|
+
from pypeline.utils.module_utils import SermosModuleLoader
|
22
|
+
from pypeline.utils.task_utils import PipelineGenerator, PipelineResult, \
|
23
|
+
get_service_config_for_worker
|
24
|
+
from pypeline.extensions import sermos_config, sermos_client_version
|
25
|
+
from pypeline import __version__
|
26
|
+
|
27
|
+
logger = logging.getLogger('celery')

# Settings below are read from the environment once, at import time.
# Flag-style settings are enabled only when the variable equals 'true'
# (case-insensitive); anything else, including unset, means disabled.
ENABLE_TOOLS = os.environ.get('ENABLE_TOOLS', 'false').lower() == 'true'
CELERY_TASKS_ACK_LATE = (
    os.environ.get('CELERY_TASKS_ACK_LATE', 'false').lower() == 'true')
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# True when ENV is unset or equals 'production' (case-insensitive).
OVERLOAD_ES = os.environ.get('ENV', 'production').lower() == 'production'
# Stays None when the variable is not set.
PIPELINE_CHORD_COMPRESSION = os.environ.get('PIPELINE_CHORD_COMPRESSION')

# Logging is configured as a side effect of importing this module.
setup_logging(
    app_version=__version__,
    client_version=sermos_client_version,
    default_level=LOG_LEVEL,
    overload_elasticsearch=OVERLOAD_ES,
    establish_logging_config=True,
)
|
39
|
+
|
40
|
+
|
41
|
+
def pipeline_retry(event: dict):
    """ Handle pipeline retry and deadletter logic.

    Rebuilds the pipeline chain through ``PipelineGenerator`` and, when a
    chain is produced, sleeps for an exponential backoff before submitting
    it again with ``apply_async()``.

    Args:
        event (dict): Retry request. ``pipeline_id`` and ``execution_id``
            are required; ``access_key``, ``queue``, ``default_task_ttl``,
            ``add_retry`` and ``chain_payload`` are optional and forwarded
            to ``PipelineGenerator``.

    Returns:
        bool: False when ``pipeline_id`` or ``execution_id`` is missing.
            True otherwise, including when no chain was re-submitted.
    """
    access_key = event.get('access_key')
    pipeline_id = event.get('pipeline_id')
    execution_id = event.get('execution_id')

    identifiers_missing = pipeline_id is None or execution_id is None
    if identifiers_missing:
        logger.error(f"Unable to retry pipeline {pipeline_id} / "
                     f"execution {execution_id}.")
        return False

    generator_options = {
        'pipeline_id': pipeline_id,
        'access_key': access_key,
        'execution_id': execution_id,
        'queue': event.get('queue'),
        'default_task_ttl': event.get('default_task_ttl'),
        'add_retry': event.get('add_retry', False),
        'chain_payload': event.get('chain_payload'),
    }
    gen = PipelineGenerator(**generator_options)

    # generate_chain() will return `None` if the pipeline has exceeded
    # max retry count or other errors happen.
    chain = gen.generate_chain() if gen.good_to_go else None

    if chain is not None:
        # Exponential backoff: 3 ** retry_count seconds plus up to one second
        # of random jitter, capped at DEFAULT_RETRY_TASK_MAX_TTL.
        base_delay = 3 ** gen.pipeline_wrapper.retry_count
        jitter = random.randint(0, 1000) / 1000
        exponential_backoff = min(base_delay + jitter,
                                  DEFAULT_RETRY_TASK_MAX_TTL)
        logger.debug(f"Exponential backoff sleep {exponential_backoff}")
        # Blocks the calling worker for the whole delay.
        time.sleep(exponential_backoff)
        # Kick it off again.
        chain.apply_async()

    # Logged whether or not a chain was actually re-submitted.
    logger.warning(f"Pipeline retry was invoked for {pipeline_id} "
                   f"({execution_id})")
    return True
|
78
|
+
|
79
|
+
|
80
|
+
def task_chain_regulator(*args, **kwargs):
    """ No-op task placed between groups so a chain runs them in sequence.

    When every element of a chain() is a group(), celery does not reliably
    run the elements one after another. Putting a plain (non-group) task
    such as this one between the groups makes the chain behave as expected.

    All positional and keyword arguments are accepted and ignored.

    Returns:
        bool: Always True.
    """
    return True
|
89
|
+
|
90
|
+
|
91
|
+
def pipeline_success(event: dict):
    """ Mark a pipeline execution as successful.

    Loads the stored ``PipelineResult`` for the execution and saves it back
    with ``status='success'``.

    Args:
        event (dict): Must contain ``execution_id``.

    Raises:
        KeyError: If ``execution_id`` is not present in ``event``.
    """
    pr = PipelineResult(event['execution_id'])
    # Load the existing record first, then save it with the new status.
    pr.load()
    pr.save(status='success')
|
102
|
+
|
103
|
+
|
104
|
+
class GenerateCeleryTasks(SermosModuleLoader):
    """ Use the sermos.yaml configuration to turn customer methods into
        decorated celery tasks that are available for work/pipelines
    """
    def __init__(self, config: dict, celery_instance: Celery):
        """ Store the Sermos config and the Celery app to register tasks on.

            Args:
                config (dict): Parsed Sermos configuration. A falsy value is
                    replaced with an empty dict.
                celery_instance (Celery): App whose ``task()`` decorator is
                    used to register each handler.
        """
        super().__init__()
        self.config = config if config else {}
        self.celery = celery_instance

    def _get_default_tasks(self) -> List[dict]:
        """ Sermos provides default tasks that all workers should know about.

            Returns:
                List[dict]: One ``{'handler': <dotted path>}`` per default
                    task.
        """
        # NOTE(review): these dotted paths point at a `sermos` package while
        # the functions are defined in this module -- confirm `sermos.celery`
        # is importable wherever workers run.
        return [{
            'handler': 'sermos.celery.pipeline_retry'
        }, {
            'handler': 'sermos.celery.task_chain_regulator'
        }, {
            'handler': 'sermos.celery.pipeline_success'
        }]

    def generate(self):
        """ Loads methods based on sermos config file and decorates them as
            celery tasks.

            Customer's methods:
            --------------------------------
            def demo_task(*args, **kwargs):
                return True

            Turns into the equivalent of:
            --------------------------------
            @celery.task
            def demo_task(*args, **kwargs):
                return True

            A registered task that cannot be loaded or decorated is logged
            and skipped. A failure on one of the default tasks is not caught
            and propagates to the caller.
        """
        # Set in k8s deployment as an environment variable when Sermos Cloud
        # generates the final secrets.yaml file. The name comes from the user's
        # sermos.yaml file based on serviceConfig[].name. Each 'worker' will
        # have a single name and each individually registers tasks through its
        # registeredTasks list. This allows each worker to only attempt
        # bootstrapping those tasks that are relevant to the worker and not, for
        # example, attempt to import a package that's not used by this worker
        service = get_service_config_for_worker(self.config)
        if not service:
            return

        for task in service.get('registeredTasks', []):
            try:
                worker_path = task['handler']  # Required, no default
                # Register the callable on the app. No options are passed,
                # so the task uses the app's current base Task class.
                self.celery.task(self.get_callable(worker_path))
            except Exception as e:
                logger.warning(f"Unable to add a task to celery: {e}")

        # Sermos provides default tasks that all workers should know about, add
        # them here.
        for task in self._get_default_tasks():
            self.celery.task(self.get_callable(task['handler']))
|
166
|
+
|
167
|
+
|
168
|
+
def configure_celery(celery: Celery):
    """ Configure Sermos-compatible Celery instance. Primarily this means
        compatibility with Pipelines and Scheduled Tasks through injecting the
        event kwarg. Also sets prebaked defaults that can be overloaded by user.

        Steps, in order: install ``ChainedTask`` as ``celery.Task``, apply
        the configuration values, register the dyrygent workflow processor,
        then generate the tasks declared in the Sermos configuration.

        Args:
            celery (Celery): Application to configure; modified in place.

        Returns:
            Celery: The same ``celery`` instance.

        Exits the process with status 1 if task generation raises.
    """
    redis_url = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
    broker_url = os.environ.get('CELERY_BROKER_URL', redis_url)
    result_backend = os.environ.get('CELERY_RESULT_BACKEND', redis_url)
    TaskBase = celery.Task

    class ChainedTask(TaskBase):
        """ A Celery Task that is used as the _base_ for all dynamically
            generated tasks (by GenerateCeleryTasks().generate()). This injects
            `event` into every task's signature, which allows pipelines to pass
            event information easily through a chain.
        """
        abstract = True
        autoretry_for = (Exception,)
        max_retries = DEFAULT_MAX_RETRY
        retry_backoff = True
        retry_jitter = True

        def __call__(self, *args, **kwargs):
            """ Allow the return value of one task to update the kwargs of a
                subsequent task if it's a dictionary. Important to the function
                of a pipeline to allow event information to flow easily.
            """
            # A single positional dict (e.g. the previous task's return value)
            # is merged into kwargs and then dropped from args.
            if len(args) == 1 and isinstance(args[0], dict):
                kwargs.update(args[0])
                args = ()

            # Event holds information used in PipelineRunWrapper and
            # other areas.
            kwargs.setdefault('event', {})

            # This is a special worker from dyrygent that orchestrates our
            # pipelines. It provides a patch in fix for celery's poor
            # implementation of Canvas work-flows
            # NOTE(review): relies on the task instance exposing `__name__`;
            # confirm that holds for the dyrygent workflow processor task.
            if self.__name__ == 'workflow_processor':
                kwargs.pop('event', None)
            return super().__call__(*args, **kwargs)

    celery.Task = ChainedTask

    # Configure the broker and tasks
    celery.conf.broker_url = broker_url

    # Use our custom database scheduler for dynamic celery beat updates.
    celery.conf.beat_scheduler = 'sermos.celery_beat:SermosScheduler'

    # Reasonable defaults, override as necessary
    celery.conf.worker_redirect_stdouts = True
    celery.conf.worker_redirect_stdouts_level = LOG_LEVEL
    celery.conf.worker_hijack_root_logger = False

    if PIPELINE_CHORD_COMPRESSION:
        celery.conf.task_compression = PIPELINE_CHORD_COMPRESSION

    # NOTE: The broker URL may not be the best result backend. For example,
    # When using Rabbit as the broker (recommended), you should use Redis
    # as the result backend, as Rabbit has horrible support as backend.
    celery.conf.result_backend = result_backend
    celery.conf.task_ignore_result = False  # Must not ignore for Chords
    celery.conf.result_expires = int(
        os.environ.get('CELERY_RESULT_EXPIRES', 10800))  # 3 hours by default
    celery.conf.broker_pool_limit = int(
        os.environ.get('BROKER_POOL_LIMIT', 10))
    celery.conf.worker_max_tasks_per_child = int(
        os.environ.get('MAX_TASKS_PER_CHILD', 100))

    # Soft and hard limits come from the same setting; the hard limit gets a
    # 10 second cleanup buffer on top.
    task_timeout = int(os.environ.get('TASK_TIMEOUT_SECONDS', 3600))
    celery.conf.task_soft_time_limit = task_timeout
    celery.conf.task_time_limit = task_timeout + 10

    # Driven solely by the CELERY_TASKS_ACK_LATE environment flag.
    celery.conf.task_acks_late = CELERY_TASKS_ACK_LATE
    celery.conf.task_serializer = 'json'
    celery.conf.result_serializer = 'json'
    celery.conf.accept_content = ['json']
    # Required config options for some brokers we use frequently.
    transport_options = {}
    celery.conf.broker_transport_options = transport_options

    # Sermos generally has long-running tasks (relatively speaking), so
    # limit number of jobs a worker can reserve. This may not be true for
    # all tasks, so configure this on a per application basis. In the event
    # multiple task kinds exist in an application (short and long), see
    # http://docs.celeryproject.org/en/latest/userguide/optimizing.html#optimizing-prefetch-limit
    # for some guidance on combining multiple workers and routing tasks.
    # TODO make configurable from env
    celery.conf.worker_prefetch_multiplier = 1

    # Add our application's workers & any other tasks to be made
    # available
    register_workflow_processor(celery)
    try:
        GenerateCeleryTasks(sermos_config, celery).generate()
    except Exception as e:
        logger.error(f"Unable to dynamically generate celery tasks: {e}")
        sys.exit(1)

    return celery
|
pypeline/celery_beat.py
ADDED
@@ -0,0 +1,254 @@
|
|
1
|
+
""" Custom Sermos Scheduler and Celery Entry classes used for dynamic beat.
|
2
|
+
"""
|
3
|
+
import datetime
|
4
|
+
import os
|
5
|
+
import logging
|
6
|
+
from rhodb.redis_conf import RedisConnector
|
7
|
+
from celery.beat import Scheduler, ScheduleEntry
|
8
|
+
from celery import current_app
|
9
|
+
from celery.utils.time import is_naive
|
10
|
+
from celery.schedules import schedule as c_schedule, crontab as c_crontab
|
11
|
+
from pypeline.utils.config_utils import retrieve_latest_schedule_config, \
|
12
|
+
update_schedule_config
|
13
|
+
from pypeline.constants import CONFIG_REFRESH_RATE, SCHEDULE_DATE_FORMAT, \
|
14
|
+
USING_SERMOS_CLOUD
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
redis_conn = RedisConnector().get_connection()
|
18
|
+
|
19
|
+
|
20
|
+
def convert_to_datetime(
        datetime_str: str,
        datetime_format: str = SCHEDULE_DATE_FORMAT) -> datetime.datetime:
    """ Parse a date string into a datetime object.

        Args:
            datetime_str (str): Text to parse.
            datetime_format (str): ``strptime`` format the text must match.
                Defaults to ``SCHEDULE_DATE_FORMAT``.

        Returns:
            datetime.datetime: The parsed value.

        Raises:
            ValueError: If ``datetime_str`` does not match the format.
    """
    parsed = datetime.datetime.strptime(datetime_str, datetime_format)
    return parsed
|
26
|
+
|
27
|
+
|
28
|
+
def instantiate_celery_schedule(schedule_entry: dict) -> c_schedule:
    """ From a schedule entry and the full schedule from Sermos, create a
        celery `schedule` object.

        Args:
            schedule_entry (dict): Must contain ``config.scheduleType``
                ('interval' or 'crontab') and ``config.schedule``. For
                'interval' the schedule needs ``period`` (used as a
                ``datetime.timedelta`` keyword) and ``every``; for 'crontab'
                it needs ``minute``, ``hour``, ``dayOfWeek``, ``dayOfMonth``
                and ``monthOfYear``.

        Returns:
            A celery ``schedule`` for 'interval' entries or a celery
            ``crontab`` for 'crontab' entries.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``scheduleType`` is neither 'interval' nor
                'crontab'.
    """
    config = schedule_entry['config']
    schedule_type = config['scheduleType']

    if schedule_type == 'interval':
        spec = config['schedule']
        # `period` names the timedelta unit, `every` is its amount.
        the_delta = datetime.timedelta(**{spec['period']: spec['every']})
        return c_schedule(run_every=the_delta)

    if schedule_type == 'crontab':
        spec = config['schedule']
        return c_crontab(minute=spec['minute'],
                         hour=spec['hour'],
                         day_of_week=spec['dayOfWeek'],
                         day_of_month=spec['dayOfMonth'],
                         month_of_year=spec['monthOfYear'])

    raise ValueError(f"Unsupported scheduleType ({schedule_type}) ...")
|
51
|
+
|
52
|
+
|
53
|
+
class SermosEntry(ScheduleEntry):
    """ Create a beat entry with additional functionality for Sermos scheduler.

        https://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html
    """
    def __init__(self, schedule_entry: dict = None, **kwargs):
        """ Build the entry from a Sermos schedule dict or from plain kwargs.

            Args:
                schedule_entry (dict): Sermos schedule entry. Reads ``name``,
                    ``config`` (``task``, schedule definition and optional
                    ``queue``/``exchange``/``routing_key``/``expires``) and
                    the optional ``lastRunAt``, ``totalRunCount``, ``args``
                    and ``kwargs`` keys; ``id`` is also required when
                    USING_SERMOS_CLOUD is set. ``lastRunAt`` is read only and
                    never written back into this dict.
                **kwargs: Used only when ``schedule_entry`` is empty; passed
                    straight to ``ScheduleEntry`` (minus ``sermos_id``).
        """
        if schedule_entry:
            # This event is being instantiated directly with the Sermos
            # schedule entry
            celery_schedule = instantiate_celery_schedule(schedule_entry)
            config = schedule_entry['config']

            # celery.beat.ScheduleEntry expects these keys in a dictionary
            # called `options`. See
            # https://docs.celeryproject.org/en/stable/userguide/calling.html
            # In the case of Sermos, we require the queue in the
            # ScheduleEntrySchema, others are all optional.
            optional_keys = ('queue', 'exchange', 'routing_key', 'expires')
            options = {
                key: config[key]
                for key in optional_keys if config.get(key) is not None
            }

            # Resolve the last run time without touching the caller's dict,
            # so callers that compare schedule dicts see unmodified data.
            last_run_at = schedule_entry.get('lastRunAt')
            if last_run_at is None:
                last_run_at = current_app.now()
            elif isinstance(last_run_at, str):
                last_run_at = convert_to_datetime(last_run_at)

            # Drop tzinfo from aware datetimes; the clock fields are kept
            # as-is (no conversion is applied).
            if not is_naive(last_run_at):
                last_run_at = last_run_at.replace(tzinfo=None)

            name = schedule_entry['name']
            if USING_SERMOS_CLOUD:
                # We need to keep track of the id because this is used to
                # send updates to sermos cloud. The name can't be
                # concatenated with the sermos id or else it will be created
                # as a duplicate celery beat task.
                self.sermos_id = schedule_entry['id']

            super().__init__(app=current_app._get_current_object(),
                             name=name,
                             task=config['task'],
                             args=schedule_entry.get('args', None),
                             kwargs=schedule_entry.get('kwargs', None),
                             options=options,
                             schedule=celery_schedule,
                             last_run_at=last_run_at,
                             total_run_count=schedule_entry.get(
                                 'totalRunCount', 0))
        else:
            # This is a task issued directly by celery's scheduler so won't
            # have the schedule_entry argument. Still not entirely clear why
            # this is seen. Pop the id before initializing the super class.
            # Add it back after so we can keep sermos up to date w/ config.
            if USING_SERMOS_CLOUD:
                # Entries created without a sermos_id get None rather than
                # raising KeyError.
                sermos_id = kwargs.pop('sermos_id', None)
                super().__init__(**kwargs)
                self.sermos_id = sermos_id
            else:
                super().__init__(**kwargs)

        # Ensure all events have 'event' key - this is populated by ChainedTask
        if self.kwargs is None:
            self.kwargs = {}
        self.kwargs.setdefault('event', {})
|
126
|
+
|
127
|
+
|
128
|
+
class SermosScheduler(Scheduler):
    """ Sermos' implementation of a Celery Scheduler. Leverages a Sermos
        configuration server to provide the up-to-date schedule and provides to
        this scheduler for in-memory tracking.
    """
    Entry = SermosEntry
    _last_refresh = None  # Internal time keeper for Sermos syncing
    _refresh_rate = CONFIG_REFRESH_RATE * 1000000  # Turn to microseconds
    _schedule = None  # Holds latest Celery schedule with only enabled tasks
    _schedule_full = None  # Holds latest schedule, regardless of enabled
    _initial_read = True  # Set to False upon initial bootstrapping

    def __init__(self, *args, **kwargs):
        """ Load the schedule, then initialize the base Scheduler with it.

            ``max_interval`` and ``schedule`` in ``kwargs`` are always
            overwritten before being passed to ``Scheduler.__init__``.
        """
        logger.info("Initializing SermosScheduler ...")
        # This step ensures the latest schedule is pulled from Sermos/cache
        # and bootstraps the local time checker we use.
        self.set_under_schedule()
        self._last_refresh = datetime.datetime.utcnow()

        # Max interval defaults to CONFIG_REFRESH_RATE so our schedule is
        # always forced to be up to date.
        max_interval = int(
            os.environ.get('CELERY_BEAT_SYNC_MAX_INTERVAL',
                           CONFIG_REFRESH_RATE))
        kwargs['max_interval'] = max_interval

        kwargs['schedule'] = self._schedule
        Scheduler.__init__(self, *args, **kwargs)

    def set_under_schedule(self):
        """ Parse the latest schedule config and set self._schedule with parsed
            schedule including only those that are enabled.

            Also stores every entry (enabled or not) in self._schedule_full
            and logs when that full list differs from the previous one.
        """
        # Shallow copies of the previous entries, taken before the refresh,
        # for the change report below.
        previous_full = [entry.copy() for entry in self._schedule_full
                         ] if self._schedule_full else []

        enabled = {}
        full = []
        for sched in retrieve_latest_schedule_config():
            full.append(sched)  # Append to full list regardless of enabled
            if sched['enabled']:
                enabled[sched['name']] = SermosEntry(sched)
        self._schedule = enabled
        self._schedule_full = full

        # Report if schedule changed
        if self._schedule_full != previous_full:
            logger.info("SermosScheduler: Schedule updated ...")
            logger.info(f"SermosScheduler: {self._schedule}")

    def get_current_sermos_schedule(self):
        """ Unpack Celery's current representation of the schedule into Sermos
            format. This is used to send updates back to Sermos related to dynamic
            properties such as last_run_at and total_run_count.

            Returns:
                dict: ``{'schedules': [...]}`` with one ``id`` /
                    ``lastRunAt`` (ISO 8601 string) / ``totalRunCount``
                    dict per entry.
        """
        sched = {'schedules': []}
        for entry in self.schedule.values():
            sched['schedules'].append({
                'id': entry.sermos_id,
                'lastRunAt': entry.last_run_at.isoformat(),
                'totalRunCount': entry.total_run_count
            })

        return sched

    def setup_schedule(self):
        """ Install Celery's default entries, then merge in our schedule.
        """
        self.install_default_entries(self.data)
        # Overload default behavior and instead bootstrap with our _schedule
        # instead of app.conf.beat_schedule.
        self.merge_inplace(self._schedule)

    def should_refresh(self):
        """ Determine if enough time has elapsed to perform a schedule refresh.

            We turn everything into microseconds so we don't spam external services
            intra-second as most of the time, more than one task exists in the
            schedule and therefore we need to check the scheduler's `schedule`
            on each task very rapidly when issuing tasks.

            Returns:
                bool: True when more than ``_refresh_rate`` microseconds
                    have passed since the last refresh; in that case the
                    refresh timestamp is also reset.
        """
        now = datetime.datetime.utcnow()
        # total_seconds() covers days, seconds and microseconds, so the
        # fractional part is exact.
        elapsed = now - self._last_refresh
        microseconds_since_last_refresh = elapsed.total_seconds() * 1000000
        res = microseconds_since_last_refresh > self._refresh_rate
        if res:
            # The stored timestamp is backdated by one millisecond.
            self._last_refresh = now - datetime.timedelta(milliseconds=1)
        return res

    def sync(self):
        """ Sync local schedule with Sermos and update Celery's representation
            TODO check this vis-a-vis local vs cloud
        """
        # Push run statistics only when using Sermos Cloud and there is a
        # non-empty schedule to report.
        if self.schedule and USING_SERMOS_CLOUD:
            update_schedule_config(self.get_current_sermos_schedule())
        self.set_under_schedule()  # Internal representation
        self.merge_inplace(self._schedule)  # Celery representation

    def get_schedule(self):
        """ Overload default Scheduler get_schedule method to check for updates

            Note: Celery uses a property function, e.g.:
            https://www.tutorialsteacher.com/python/property-function
            for getting/setting the schedule internally. We only override the
            get_schedule method here.

            Returns:
                dict: The current schedule of enabled entries, after a sync
                    on the first read or once the refresh interval elapsed.
        """
        update = False
        if self._initial_read:
            logger.info('SermosScheduler: Initial read ...')
            update = True
            self._initial_read = False
        elif self.should_refresh():
            logger.info('SermosScheduler: Refreshing schedule ...')
            update = True

        if update:
            self.sync()

        return self._schedule

    def set_schedule(self, schedule):
        """ Redefine Celery set_schedule method
        """
        self.data = schedule

    # Redefine Celery schedule property()
    schedule = property(get_schedule, set_schedule)
|
pypeline/cli/__init__.py
ADDED
File without changes
|
pypeline/cli/config_server.py
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
""" Command Line Utilities for starting the local configuration server
|
2
|
+
"""
|
3
|
+
import logging
|
4
|
+
import click
|
5
|
+
from pypeline.lib.config_server import api, set_api_config
|
6
|
+
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
@click.group()
def config_server():
    """ Configuration server command group.
    """
|
14
|
+
|
15
|
+
|
16
|
+
@config_server.command()
@click.option('--base-dir', required=False, default=None)
@click.option('--pipelines-yaml', required=False, default=None)
@click.option('--schedules-json', required=False, default=None)
@click.option('--port', required=False, default=8000, type=int)
def local_config_api(base_dir: str = None,
                     pipelines_yaml: str = None,
                     schedules_json: str = None,
                     port: int = 8000):
    """ Start a local configuration API server for development.

        This will use the provided pipelines.yaml and schedules.json file to
        mock the API endpoints available to managed Sermos Deployments. To use
        for development, make sure to set the DEFAULT_BASE_URL in your application's
        environment (e.g. DEFAULT_BASE_URL=http://localhost:8000/api/v1/)

        Arguments::

            base-dir (optional): Directory name where your development config
                files reside. Defaults to `dev`.

            pipelines-yaml (optional): Path to find your `pipelines.yaml`
                configuration file. Defaults to `pipelines.yaml`

            schedules-json (optional): Path to find your `schedules.json`
                configuration file. Defaults to `schedules.json`

            port (optional): Port the server listens on. Defaults to 8000.
    """
    click.echo("Starting Local Sermos Configuration Server ...")
    # Options left unset are passed through as None.
    set_api_config(base_dir=base_dir,
                   pipelines_yaml=pipelines_yaml,
                   schedules_json=schedules_json)

    # NOTE: host '0.0.0.0' listens on every network interface, not only
    # localhost. Intended for development use.
    api.run(port=port, host='0.0.0.0')
|
pypeline/cli/core.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
""" Primary CLI group entrypoint
|
2
|
+
"""
|
3
|
+
import logging
|
4
|
+
import click
|
5
|
+
from pypeline.logging_config import setup_logging
|
6
|
+
|
7
|
+
setup_logging(default_level='INFO', establish_logging_config=False)
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
# Which CLI groups are available depends on the installed extras. A group
# whose module fails to import is left out with a warning instead of breaking
# the whole CLI. For example, the config server won't work if the `workers`
# extra isn't available, which installs celery and networkx.
collection = []

warning_msg = ("{} CLI tools are not available. This is most "
               "likely due to a missing import. Verify you have the correct "
               "Sermos extras installed.")

try:
    from pypeline.cli.deploy import deployment
except ImportError as e:
    logger.warning(warning_msg.format("Deployment"))
    logger.warning(str(e))
else:
    collection.append(deployment)

try:
    from pypeline.cli.config_server import config_server
except ImportError as e:
    logger.warning(warning_msg.format("Configuration Server"))
    logger.warning(str(e))
else:
    collection.append(config_server)

# Single entry point exposing the commands of every group that imported.
sermos = click.CommandCollection(sources=collection)
|