scalable-pypeline 1.2.3-py2.py3-none-any.whl → 2.0.2-py2.py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- pypeline/__init__.py +1 -1
- pypeline/barrier.py +34 -0
- pypeline/composition.py +349 -0
- pypeline/constants.py +51 -84
- pypeline/dramatiq.py +470 -0
- pypeline/extensions.py +9 -8
- pypeline/flask/__init__.py +3 -5
- pypeline/flask/api/pipelines.py +109 -148
- pypeline/flask/api/schedules.py +14 -39
- pypeline/flask/decorators.py +18 -53
- pypeline/flask/flask_pypeline.py +156 -0
- pypeline/middleware.py +61 -0
- pypeline/pipeline_config_schema.py +105 -92
- pypeline/pypeline_yaml.py +458 -0
- pypeline/schedule_config_schema.py +35 -120
- pypeline/utils/config_utils.py +52 -310
- pypeline/utils/module_utils.py +35 -71
- pypeline/utils/pipeline_utils.py +161 -0
- scalable_pypeline-2.0.2.dist-info/METADATA +217 -0
- scalable_pypeline-2.0.2.dist-info/RECORD +27 -0
- scalable_pypeline-2.0.2.dist-info/entry_points.txt +3 -0
- tests/fixtures/__init__.py +0 -1
- pypeline/celery.py +0 -206
- pypeline/celery_beat.py +0 -254
- pypeline/flask/api/utils.py +0 -35
- pypeline/flask/flask_sermos.py +0 -156
- pypeline/generators.py +0 -196
- pypeline/logging_config.py +0 -171
- pypeline/pipeline/__init__.py +0 -0
- pypeline/pipeline/chained_task.py +0 -70
- pypeline/pipeline/generator.py +0 -254
- pypeline/sermos_yaml.py +0 -442
- pypeline/utils/graph_utils.py +0 -144
- pypeline/utils/task_utils.py +0 -552
- scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
- scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
- scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
- tests/fixtures/s3_fixtures.py +0 -52
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/LICENSE +0 -0
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/WHEEL +0 -0
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/top_level.txt +0 -0
pypeline/celery_beat.py
DELETED
@@ -1,254 +0,0 @@
-""" Custom Sermos Scheduler and Celery Entry classes used for dynamic beat.
-"""
-import datetime
-import os
-import logging
-from rhodb.redis_conf import RedisConnector
-from celery.beat import Scheduler, ScheduleEntry
-from celery import current_app
-from celery.utils.time import is_naive
-from celery.schedules import schedule as c_schedule, crontab as c_crontab
-from pypeline.utils.config_utils import retrieve_latest_schedule_config, \
-    update_schedule_config
-from pypeline.constants import CONFIG_REFRESH_RATE, SCHEDULE_DATE_FORMAT, \
-    USING_SERMOS_CLOUD
-
-logger = logging.getLogger(__name__)
-redis_conn = RedisConnector().get_connection()
-
-
-def convert_to_datetime(
-        datetime_str: str,
-        datetime_format: str = SCHEDULE_DATE_FORMAT) -> datetime.datetime:
-    """ Accept a string in the standard format and return a datetime object
-    """
-    return datetime.datetime.strptime(datetime_str, datetime_format)
-
-
-def instantiate_celery_schedule(schedule_entry: dict) -> c_schedule:
-    """ From a schedule entry and the full schedule from Sermos, create a
-    celery `schedule` object.
-    """
-    scheduleType = schedule_entry['config']['scheduleType']
-
-    if scheduleType == 'interval':
-        # Create a timedelta object
-        period = schedule_entry['config']['schedule']['period']
-        every = schedule_entry['config']['schedule']['every']
-        the_delta = datetime.timedelta(**{period: every})
-        # Instantiate the celery schedule object
-        return c_schedule(run_every=the_delta)
-
-    if scheduleType == 'crontab':
-        return c_crontab(
-            minute=schedule_entry['config']['schedule']['minute'],
-            hour=schedule_entry['config']['schedule']['hour'],
-            day_of_week=schedule_entry['config']['schedule']['dayOfWeek'],
-            day_of_month=schedule_entry['config']['schedule']['dayOfMonth'],
-            month_of_year=schedule_entry['config']['schedule']['monthOfYear'])
-
-    raise ValueError(f"Unsupported scheduleType ({scheduleType} ...")
-
-
-class SermosEntry(ScheduleEntry):
-    """ Create a beat entry with additional functionality for Sermos scheduler.
-
-    https://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html
-    """
-    def __init__(self, schedule_entry: dict = None, **kwargs):
-        schedule_entry = schedule_entry if schedule_entry else {}
-        if schedule_entry:
-            # This event is being instantiated directly with the Sermos
-            # schedule entry
-            celery_schedule = instantiate_celery_schedule(schedule_entry)
-
-            # celery.beat.ScheduleEntry expects these keys in a dictionary
-            # called `options`. See
-            # https://docs.celeryproject.org/en/stable/userguide/calling.html
-            # In the case of Sermos, we require the queue in the
-            # ScheduleEntrySchema, others are all optional.
-            options = dict()
-            optional_keys = ('queue', 'exchange', 'routing_key', 'expires')
-            for key in optional_keys:
-                value = schedule_entry['config'].get(key, None)
-                if value is not None:
-                    options[key] = value
-
-            last_run_at = schedule_entry.get('lastRunAt')
-            if last_run_at is None:
-                last_run_at = current_app.now()
-                schedule_entry['lastRunAt'] = last_run_at
-            if isinstance(schedule_entry['lastRunAt'], str):
-                last_run_at = convert_to_datetime(
-                    schedule_entry['lastRunAt'])
-
-            # Verify times are accurate
-            orig = last_run_at
-            if not is_naive(last_run_at):
-                last_run_at = last_run_at.replace(tzinfo=None)
-            assert orig.hour == last_run_at.hour  # timezone sanity
-
-            if USING_SERMOS_CLOUD:
-                # We need to keep track of the id because this used to send
-                # updates to sermos cloud. The name can't be concatenated
-                # with the sermos id or else it will be created as duplicate
-                # celery beat task.
-                name = schedule_entry['name']
-                self.sermos_id = schedule_entry['id']
-            else:
-                name = schedule_entry['name']
-
-            super().__init__(app=current_app._get_current_object(),
-                             name=name,
-                             task=schedule_entry['config']['task'],
-                             args=schedule_entry.get('args', None),
-                             kwargs=schedule_entry.get('kwargs', None),
-                             options=options,
-                             schedule=celery_schedule,
-                             last_run_at=last_run_at,
-                             total_run_count=schedule_entry.get(
-                                 'totalRunCount', 0))
-        else:
-            # This is a task issued directly by celery's scheduler so won't
-            # have the schedule_entry argument. Still not entirely clear why
-            # this is seen. Pop the id before initializing the super class.
-            # Add it back after so we can keep sermos up to date w/ config.
-            if USING_SERMOS_CLOUD:
-                sermos_id = kwargs.pop('sermos_id')
-                super().__init__(**kwargs)
-                self.sermos_id = sermos_id
-            else:
-                super().__init__(**kwargs)
-
-        # Ensure all events have 'event' key - this is populated by ChainedTask
-        if 'event' not in self.kwargs.keys():
-            self.kwargs['event'] = {}
-
-
-class SermosScheduler(Scheduler):
-    """ Sermos' implementation of a Celery Scheduler. Leverages a Sermos
-    configuration server to provide the up-to-date schedule and provides to
-    this scheduler for in-memory tracking.
-    """
-    Entry = SermosEntry
-    _last_refresh = None  # Internal time keeper for Sermos syncing
-    _refresh_rate = CONFIG_REFRESH_RATE * 1000000  # Turn to microseconds
-    _schedule = None  # Holds latest Celery schedule with only enabled tasks
-    _schedule_full = None  # Holds latest schedule, regardless of enabled
-    _initial_read = True  # Set to False upon initial bootstrapping
-
-    def __init__(self, *args, **kwargs):
-        logger.info("Initializing SermosScheduler ...")
-        # This step ensures the latest schedule is pulled from Sermos/cache
-        # and bootstraps the local time checker we use.
-        self.set_under_schedule()
-        self._last_refresh = datetime.datetime.utcnow()
-
-        # Default 60 second max interval here so our schedule is always
-        # forced to be up to date.
-        max_interval = int(
-            os.environ.get('CELERY_BEAT_SYNC_MAX_INTERVAL',
-                           CONFIG_REFRESH_RATE))
-        kwargs['max_interval'] = max_interval
-
-        kwargs['schedule'] = self._schedule
-        Scheduler.__init__(self, *args, **kwargs)
-
-    def set_under_schedule(self):
-        """ Parse the latest schedule config and set self._schedule with parsed
-        schedule including only those that are enabled.
-        """
-        s = {}
-        s_full = []
-        s_full_orig = [s.copy() for s in self._schedule_full
-                       ] if self._schedule_full else []
-        latest_schedule = retrieve_latest_schedule_config()
-        for sched in latest_schedule:
-            s_full.append(sched)  # Append to full list regardless of enabled
-            if sched['enabled']:
-                s[sched['name']] = SermosEntry(sched)
-        self._schedule = s
-        self._schedule_full = s_full
-
-        # Report if schedule changed
-        if self._schedule_full != s_full_orig:
-            logger.info("SermosScheduler: Schedule updated ...")
-            logger.info(f"SermosScheduler: {self._schedule}")
-
-    def get_current_sermos_schedule(self):
-        """ Unpack Celery's current representation of the schedule into Sermos
-        format. This is used to send updates back to Sermos related to dynamic
-        properties such as last_run_at and total_run_count.
-        """
-
-        sched = {'schedules': []}
-        for entry_name, entry in self.schedule.items():
-            sched['schedules'].append({
-                'id': entry.sermos_id,
-                'lastRunAt': entry.last_run_at.isoformat(),
-                'totalRunCount': entry.total_run_count
-            })
-
-        return sched
-
-    def setup_schedule(self):
-        self.install_default_entries(self.data)
-        # Overload default behavior and instead bootstrap with our _schedule
-        # instead of app.conf.beat_schedule.
-        self.merge_inplace(self._schedule)
-
-    def should_refresh(self):
-        """ Determine if enough time has elapsed to perform a schedule refresh.
-
-        We turn everything into microseconds so we don't spam external services
-        intra-second as most of the time, more than one task exists in the
-        schedule and therefore we need to check the scheduler's `schedule`
-        on each task very rapidly when issuing tasks.
-        """
-        now = datetime.datetime.utcnow()
-        microseconds_since_last_refresh = float(
-            str((now - self._last_refresh).seconds) + "." +
-            str((now - self._last_refresh).microseconds)) * 1000000
-        res = bool(microseconds_since_last_refresh > self._refresh_rate)
-        if res is True:
-            self._last_refresh = now - datetime.timedelta(milliseconds=1)
-        return res
-
-    def sync(self):
-        """ Sync local schedule with Sermos and update Celery's representation
-        TODO check this vis-a-vis local vs cloud
-        """
-        if self.schedule and USING_SERMOS_CLOUD:
-            update_schedule_config(self.get_current_sermos_schedule())
-        self.set_under_schedule()  # Internal representation
-        self.merge_inplace(self._schedule)  # Celery representation
-
-    def get_schedule(self):
-        """ Overload default Scheduler get_schedule method to check for updates
-
-        Note: Celery uses a property function, e.g.:
-        https://www.tutorialsteacher.com/python/property-function
-        for getting/setting the schedule internally. We only override the
-        get_schedule method here.
-        """
-        update = False
-        if self._initial_read:
-            logger.info('SermosScheduler: Initial read ...')
-            update = True
-            self._initial_read = False
-        elif self.should_refresh():
-            logger.info('SermosScheduler: Refreshing schedule ...')
-            update = True
-
-        if update:
-            self.sync()
-
-        return self._schedule
-
-    def set_schedule(self, schedule):
-        """ Redefine Celery set_schedule method
-        """
-        self.data = schedule
-
-    # Redefine Celery schedule property()
-    schedule = property(get_schedule, set_schedule)
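This module was the 1.2.x dynamic-beat entry point; 2.0.x drops Celery entirely in favor of Dramatiq (see the new pypeline/dramatiq.py in the summary above). For context, a custom beat scheduler like SermosScheduler is selected through Celery's `beat_scheduler` setting. A minimal sketch, assuming a Redis broker on localhost; it is not part of either package version:

```python
# Illustrative sketch only; broker URL and app name are hypothetical.
from celery import Celery

app = Celery("worker", broker="redis://localhost:6379/0")

# Celery resolves this dotted path to the Scheduler subclass when beat starts.
app.conf.beat_scheduler = "pypeline.celery_beat:SermosScheduler"

# Equivalent CLI form:
#   celery -A worker beat --scheduler pypeline.celery_beat:SermosScheduler
```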
pypeline/flask/api/utils.py
DELETED
@@ -1,35 +0,0 @@
-""" Utilities for Sermos APIs and interacting with Pipelines/Schedules
-"""
-import logging
-from typing import Union
-
-from pypeline.pipeline.generator import PipelineGenerator
-
-logger = logging.getLogger(__name__)
-
-
-def chain_helper(pipeline_id: str,
-                 access_key: Union[str, None] = None,
-                 chain_payload: Union[dict, None] = None,
-                 queue: Union[str, None] = None,
-                 default_task_ttl: int = None):
-    """ Helper method to generate a pipeline chain *with* error handling.
-
-    Usage:
-        my_chain = chain_helper('pipeline-name')
-        my_chain.delay()
-    """
-    # Get our pipeline. The PipelineGenerator will use the PipelineRunWrapper
-    # to cache this "run" of the pipeline.
-    gen = PipelineGenerator(pipeline_id,
-                            access_key=access_key,
-                            queue=queue,
-                            default_task_ttl=default_task_ttl,
-                            chain_payload=chain_payload)
-    if gen.good_to_go:
-        # Generate our 'chain', which is the grouping of celery constructs that
-        # allows our dag to run asynchronously and synchronously according to
-        # the adjacency list defined in our pipeline configuration.
-        gen.generate_chain()
-
-    return gen
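Note that the helper returns the `PipelineGenerator` itself rather than a bare chain, so the docstring's `my_chain.delay()` is shorthand. A fuller sketch; the pipeline name and payload are hypothetical, and the `.chain` attribute is an assumption about the generator's API:

```python
from pypeline.flask.api.utils import chain_helper  # removed in 2.0.x

gen = chain_helper("demo-pipeline", chain_payload={"doc_id": 42})
if gen.good_to_go:
    # Dispatch the generated celery chain asynchronously; `.chain` is
    # assumed to hold the result of gen.generate_chain().
    gen.chain.delay()
```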
pypeline/flask/flask_sermos.py
DELETED
@@ -1,156 +0,0 @@
-""" Sermos implementation as a Flask extension
-"""
-import os
-if os.getenv('USE_GEVENT', 'false').lower() == 'true':
-    import gevent.monkey
-    gevent.monkey.patch_all()
-
-import logging
-from flask import Flask
-from werkzeug.middleware.proxy_fix import ProxyFix
-from flask_smorest import Api, Blueprint
-from pypeline.extensions import sermos_config
-from pypeline.constants import DEFAULT_OPENAPI_CONFIG
-from pypeline import __version__
-
-logger = logging.getLogger(__name__)
-
-
-class FlaskSermos:
-    """ Sermos Flask extension.
-    """
-    def __init__(self, app: Flask = None):
-        """ Class init
-        """
-        self.app = app
-        self.sermos_config = sermos_config if sermos_config is not None else {}
-
-        if app is not None:
-            self.init_app(app)
-
-    def init_app(self, app: Flask, init_api: bool = False):
-        """ Sermos bootstrapping process.
-
-        Application config variables to set include:
-
-            SERMOS_CLIENT_VERSION (default: v?.?.?)
-            SERMOS_HIJACK_ROOT_LOGGER (default: False)
-
-        Optional, if `init_api` is True:
-
-            API_DOCUMENTATION_TITLE
-            API_DOCUMENTATION_DESCRIPTION
-            OPENAPI_VERSION
-            OPENAPI_URL_PREFIX
-            OPENAPI_SWAGGER_APP_NAME
-            OPENAPI_SWAGGER_UI_PATH
-            OPENAPI_SWAGGER_BASE_TEMPLATE
-            OPENAPI_SWAGGER_URL
-            OPENAPI_SWAGGER_UI_URL
-            SWAGGER_UI_DOC_EXPANSION
-            EXPLAIN_TEMPLATE_LOADING
-
-        Args:
-            app (Flask): Flask Application to initialize.
-            init_api (bool): If `True`, Sermos will initialize its
-                core APIs (including Pipelines, Scheduled Tasks, etc.) and
-                provide a pre-configured OpenAPI Spec/Swagger UI interface
-                available at the route defined in your application's config
-                under `OPENAPI_URL_PREFIX` (default `/api`). Refer to
-                [flask-smorest](https://flask-smorest.readthedocs.io/en/latest/openapi.html)
-                documentation for additional configuration options.
-        """
-        # Ensure there's a SERMOS_CLIENT_VERSION on app config
-        app.config.setdefault(
-            'SERMOS_CLIENT_VERSION',
-            app.config.get("SERMOS_CLIENT_VERSION", "v?.?.?"))
-
-        app.wsgi_app = ProxyFix(app.wsgi_app)
-        app.url_map.strict_slashes = False
-
-        # Create and register the sermos blueprint
-        bp = Blueprint('sermos',
-                       __name__,
-                       template_folder='../templates',
-                       static_folder='../static',
-                       url_prefix='/sermos')
-        app.register_blueprint(bp)
-
-        # Bootstrap api if app requests
-        if init_api is True:
-            self._bootstrap_api(app)
-
-    def _bootstrap_api(self, app: Flask):
-        """ If initializing the API, we will create the core Sermos API paths
-        and initialize the default Swagger documentation.
-        """
-        # Set sensible defaults for Swagger docs. Provided `app` will
-        # take precedent.
-        for swagger_config in DEFAULT_OPENAPI_CONFIG:
-            app.config.setdefault(
-                swagger_config[0],
-                app.config.get(swagger_config[0], swagger_config[1]))
-
-        # Attempt to override with values from client's sermos.yaml if
-        # they are available. This will add new tags and new docs if
-        # defined and add to the core Sermos API docs.
-        api_config = self.sermos_config.get('apiConfig', {})
-        api_docs = api_config.get('apiDocumentation', {})
-
-        custom_tags = api_config.get('prefixDescriptions', [])
-
-        app.config['SERMOS_CLIENT_VERSION'] = \
-            api_docs.get('version', None) \
-            if api_docs.get('version', None) is not None \
-            else app.config['SERMOS_CLIENT_VERSION']
-
-        app.config['API_DOCUMENTATION_TITLE'] = \
-            api_docs.get('title', None) \
-            if api_docs.get('title', None) is not None \
-            else app.config['API_DOCUMENTATION_TITLE']
-
-        app.config['API_DOCUMENTATION_DESCRIPTION'] = \
-            api_docs.get('description', None) \
-            if api_docs.get('description', None) is not None \
-            else app.config['API_DOCUMENTATION_DESCRIPTION']
-
-        # Set default Sermos Tags along with custom tags from sermos.yaml
-        tags = [{
-            'name': 'Pipelines',
-            'description': 'Operations related to Pipelines'
-        }, {
-            'name': 'Schedules',
-            'description': 'Operations related to Schedules'
-        }] + custom_tags
-
-        # Set up the initializing spec kwargs for API
-        spec_kwargs = {
-            'title': app.config['API_DOCUMENTATION_TITLE'],
-            'version': f"Sermos: {__version__} - "
-            f"Client: {app.config['SERMOS_CLIENT_VERSION']}",
-            'description': app.config['API_DOCUMENTATION_DESCRIPTION'],
-            'tags': tags
-        }
-        try:
-            api = Api()
-            api.init_app(app, spec_kwargs=spec_kwargs)
-
-            # Register available Sermos API Namespaces
-            self._register_api_namespaces(api)
-
-        except Exception as e:
-            api = None
-            logging.exception(f"Unable to initialize API ... {e}")
-
-        # Register the Sermos Core API as an extension for use in Client App
-        app.extensions.setdefault('sermos_core_api', api)
-
-    @staticmethod
-    def _register_api_namespaces(api: Api):
-        """ Register Default API namespaces
-        TODO add metrics APIs
-        """
-        from pypeline.flask.api.pipelines import bp as pipelinesbp
-        api.register_blueprint(pipelinesbp)
-        from pypeline.flask.api.schedules import bp as schedulesbp
-        api.register_blueprint(schedulesbp)
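FlaskSermos is superseded by the new pypeline/flask/flask_pypeline.py in 2.0.x. For reference, a minimal bootstrap sketch for the removed extension, based on its own `__init__`/`init_app` signatures; the app name and config value are placeholders:

```python
from flask import Flask
from pypeline.flask.flask_sermos import FlaskSermos  # removed in 2.0.x

app = Flask(__name__)
app.config["API_DOCUMENTATION_TITLE"] = "Demo API"  # optional override

sermos = FlaskSermos()
# init_api=True registers the Pipelines/Schedules blueprints plus the
# Swagger UI served under OPENAPI_URL_PREFIX (default /api).
sermos.init_app(app, init_api=True)
```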
pypeline/generators.py
DELETED
@@ -1,196 +0,0 @@
-import os
-from boto3 import Session
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-class KeyGenerator(object):
-    """ Common functions for key generators.
-    """
-    def __init__(self):
-        super(KeyGenerator, self).__init__()
-        self.hidden_files = ('.DS_Store', '.git', 'Icon', '.Dropbox')
-
-    def get_file_key(self, file_obj):
-        """ Required for each specific generator - how to extract key
-        """
-        return file_obj
-
-    def get_file_name(self, file_obj):
-        """ Required for each specific generator - how to extract file name
-        """
-        return file_obj
-
-    def get_file_size(self, base_path, file_obj):
-        """ Required for each specific generator - how to find file size (BYTES)
-        """
-        return 0
-
-    def get_final_path(self, base_path, file_name, return_full_path):
-        """ Required for each specific generator - create final file path that
-        is added to list.
-        """
-        if return_full_path:
-            return os.path.normpath(base_path + '/' + file_name)
-
-        return file_name
-
-    def list_iterator(self, all_files, base_path, limit=None, offset=None,
-                      size_limit=None, return_full_path=True,
-                      skip_common_hidden=True):
-        """ accept vars from everywhere to handle offset/limit/size logic
-        """
-        filtered_files = []
-        try:
-            # Compile list of all files within limit/offset if those exist
-            idx = -1
-            listed_files = 0
-            offset_reached = False
-            for f in all_files:
-                this_key = self.get_file_key(f)
-                this_filename = self.get_file_name(f)
-
-                if skip_common_hidden and this_filename in self.hidden_files:
-                    continue
-
-                idx += 1
-                if offset and idx >= int(offset):
-                    offset_reached = True
-
-                if (limit and listed_files >= int(limit))\
-                        or (offset and not offset_reached):
-                    continue
-
-                # Verify filesize. Having some issues with large PDFs (process
-                # simply killed). So allow option of skipping files above certain
-                # size in megabytes.
-                if size_limit is not None:
-                    size_in_bytes = self.get_file_size(base_path, f)
-                    if size_in_bytes > size_limit:
-                        continue
-
-                filtered_files.append(
-                    self.get_final_path(base_path, this_key, return_full_path)
-                )
-                listed_files += 1
-        except Exception as e:
-            logger.error("Unable to list objects: {0}".format(e))
-
-        return filtered_files
-
-
-class S3KeyGenerator(KeyGenerator):
-    """ Produce a list of object keys from S3.
-    """
-    def __init__(self, aws_access_key_id, aws_secret_access_key,
-                 aws_region='us-east-1'):
-        super(S3KeyGenerator, self).__init__()
-
-        session = Session(
-            aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
-            region_name=aws_region
-        )
-        self.s3 = session.client('s3')
-
-    def get_file_key(self, file_obj):
-        """ Get file key from s3 object """
-        return file_obj.get('Key', None)
-
-    def get_file_name(self, file_obj):
-        """ Get file name from s3 object """
-        if file_obj is not None:
-            key = file_obj.get('Key', None)
-            if key is not None:
-                return key.split('/')[-1]
-        return None
-
-    def get_file_size(self, base_path, file_obj):
-        """ Return file size of s3 object """
-        return file_obj.get('Size', 0)
-
-    # All files in bucket
-    # Range of files with an offset
-    def list_files(self, bucket, folder='', limit=None, offset=None,
-                   size_limit=None, return_full_path=True,
-                   skip_common_hidden=True):
-        """ Lists files inside an S3 bucket+folder
-
-        Note: This does not guarantee any sort of order. Boto+S3 does not
-            provide an interface for sorting results, so that would need
-            to happen in memory.
-
-        limit will include a maximum of 'limit' values
-        offset will start including values only after 'offset' keys
-        size_limit will not include files over a specific size (in bytes)
-        skip_common_hidden will exclude common hidden files
-        return_full_path will include 'bucket/' in key.
-        """
-        files = []
-
-        try:
-            file_data = self.s3.list_objects_v2(
-                Bucket=bucket, Delimiter='/', Prefix=folder)
-            files = self.list_iterator(
-                file_data['Contents'],
-                bucket,
-                limit=limit,
-                offset=offset,
-                size_limit=size_limit,
-                return_full_path=return_full_path,
-                skip_common_hidden=skip_common_hidden
-            )
-
-        except Exception as e:
-            logger.error("Unable to list objects: {0}".format(e))
-        return files
-
-
-class LocalKeyGenerator(KeyGenerator):
-    """ Generic generator to produce a list of file names from filesystem.
-    """
-    def __init__(self):
-        super(LocalKeyGenerator, self).__init__()
-
-    def get_file_key(self, file_obj):
-        """ Get file key from local object """
-        return file_obj
-
-    def get_file_name(self, file_obj):
-        """ Get file name from local object """
-        return file_obj
-
-    def get_file_size(self, base_path, file_obj):
-        """ Get file size from local object """
-        full_path = os.path.normpath(base_path + '/' + file_obj)
-        try:
-            return os.stat(full_path).st_size
-        except Exception as e:
-            logger.error("File {0} not found ...".format(full_path))
-            return 0
-
-    def list_files(self, folder_path, limit=None, offset=None,
-                   size_limit=None, return_full_path=True,
-                   skip_common_hidden=True):
-        """ Lists all file names inside a path.
-
-        skip_common_hidden will exclude common hidden files
-        return_full_path will include path in addition to filename
-        """
-        files = []
-        try:
-            file_data = os.listdir(folder_path)
-            files = self.list_iterator(
-                file_data,
-                folder_path,
-                limit=limit,
-                offset=offset,
-                size_limit=size_limit,
-                return_full_path=return_full_path,
-                skip_common_hidden=skip_common_hidden
-            )
-
-        except Exception as e:
-            logger.error("Unable to list objects: {0}".format(e))
-        return files
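The key generators are dropped in 2.0.x without a direct replacement. A usage sketch against the removed API; the bucket, credentials, and paths are placeholders:

```python
from pypeline.generators import S3KeyGenerator, LocalKeyGenerator  # removed in 2.0.x

# List keys under 'incoming/' in a bucket, skipping objects over ~5 MB
# (size_limit is in bytes per the docstring).
s3_gen = S3KeyGenerator("ACCESS_KEY_ID", "SECRET_ACCESS_KEY",
                        aws_region="us-east-1")
keys = s3_gen.list_files("my-bucket", folder="incoming/",
                         size_limit=5 * 1024 * 1024)

# Same interface against the local filesystem.
local_gen = LocalKeyGenerator()
paths = local_gen.list_files("/tmp/docs", limit=100, return_full_path=True)
```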