scalable-pypeline 1.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pypeline/generators.py ADDED
@@ -0,0 +1,196 @@
+ import os
+ from boto3 import Session
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class KeyGenerator(object):
+     """ Common functions for key generators.
+     """
+     def __init__(self):
+         super(KeyGenerator, self).__init__()
+         self.hidden_files = ('.DS_Store', '.git', 'Icon', '.Dropbox')
+
+     def get_file_key(self, file_obj):
+         """ Required for each specific generator - how to extract key
+         """
+         return file_obj
+
+     def get_file_name(self, file_obj):
+         """ Required for each specific generator - how to extract file name
+         """
+         return file_obj
+
+     def get_file_size(self, base_path, file_obj):
+         """ Required for each specific generator - how to find file size (BYTES)
+         """
+         return 0
+
+     def get_final_path(self, base_path, file_name, return_full_path):
+         """ Required for each specific generator - create final file path that
+         is added to list.
+         """
+         if return_full_path:
+             return os.path.normpath(base_path + '/' + file_name)
+
+         return file_name
+
+     def list_iterator(self, all_files, base_path, limit=None, offset=None,
+                       size_limit=None, return_full_path=True,
+                       skip_common_hidden=True):
+         """ Apply offset/limit/size-limit filtering to an iterable of files.
+         """
+         filtered_files = []
+         try:
+             # Compile list of all files within limit/offset if those exist
+             idx = -1
+             listed_files = 0
+             offset_reached = False
+             for f in all_files:
+                 this_key = self.get_file_key(f)
+                 this_filename = self.get_file_name(f)
+
+                 if skip_common_hidden and this_filename in self.hidden_files:
+                     continue
+
+                 idx += 1
+                 if offset and idx >= int(offset):
+                     offset_reached = True
+
+                 if (limit and listed_files >= int(limit))\
+                         or (offset and not offset_reached):
+                     continue
+
+                 # Verify file size. Having some issues with large PDFs (the
+                 # process is simply killed), so allow the option of skipping
+                 # files above a certain size in bytes.
+                 if size_limit is not None:
+                     size_in_bytes = self.get_file_size(base_path, f)
+                     if size_in_bytes > size_limit:
+                         continue
+
+                 filtered_files.append(
+                     self.get_final_path(base_path, this_key, return_full_path)
+                 )
+                 listed_files += 1
+         except Exception as e:
+             logger.error("Unable to list objects: {0}".format(e))
+
+         return filtered_files
+
+
+ class S3KeyGenerator(KeyGenerator):
+     """ Produce a list of object keys from S3.
+     """
+     def __init__(self, aws_access_key_id, aws_secret_access_key,
+                  aws_region='us-east-1'):
+         super(S3KeyGenerator, self).__init__()
+
+         session = Session(
+             aws_access_key_id=aws_access_key_id,
+             aws_secret_access_key=aws_secret_access_key,
+             region_name=aws_region
+         )
+         self.s3 = session.client('s3')
+
+     def get_file_key(self, file_obj):
+         """ Get file key from s3 object """
+         return file_obj.get('Key', None)
+
+     def get_file_name(self, file_obj):
+         """ Get file name from s3 object """
+         if file_obj is not None:
+             key = file_obj.get('Key', None)
+             if key is not None:
+                 return key.split('/')[-1]
+         return None
+
+     def get_file_size(self, base_path, file_obj):
+         """ Return file size of s3 object """
+         return file_obj.get('Size', 0)
+
+     # All files in bucket
+     # Range of files with an offset
+     def list_files(self, bucket, folder='', limit=None, offset=None,
+                    size_limit=None, return_full_path=True,
+                    skip_common_hidden=True):
+         """ Lists files inside an S3 bucket+folder.
+
+         Note: This does not guarantee any sort of order. Boto+S3 does not
+               provide an interface for sorting results, so that would need
+               to happen in memory.
+
+             limit will include a maximum of 'limit' values
+             offset will start including values only after 'offset' keys
+             size_limit will not include files over a specific size (in bytes)
+             skip_common_hidden will exclude common hidden files
+             return_full_path will include 'bucket/' in key.
+         """
+         files = []
+
+         try:
+             file_data = self.s3.list_objects_v2(
+                 Bucket=bucket, Delimiter='/', Prefix=folder)
+             files = self.list_iterator(
+                 file_data['Contents'],
+                 bucket,
+                 limit=limit,
+                 offset=offset,
+                 size_limit=size_limit,
+                 return_full_path=return_full_path,
+                 skip_common_hidden=skip_common_hidden
+             )
+
+         except Exception as e:
+             logger.error("Unable to list objects: {0}".format(e))
+         return files
+
+
+ class LocalKeyGenerator(KeyGenerator):
+     """ Generic generator to produce a list of file names from filesystem.
+     """
+     def __init__(self):
+         super(LocalKeyGenerator, self).__init__()
+
+     def get_file_key(self, file_obj):
+         """ Get file key from local object """
+         return file_obj
+
+     def get_file_name(self, file_obj):
+         """ Get file name from local object """
+         return file_obj
+
+     def get_file_size(self, base_path, file_obj):
+         """ Get file size from local object """
+         full_path = os.path.normpath(base_path + '/' + file_obj)
+         try:
+             return os.stat(full_path).st_size
+         except Exception:
+             logger.error("File {0} not found ...".format(full_path))
+             return 0
+
+     def list_files(self, folder_path, limit=None, offset=None,
+                    size_limit=None, return_full_path=True,
+                    skip_common_hidden=True):
+         """ Lists all file names inside a path.
+
+             skip_common_hidden will exclude common hidden files
+             return_full_path will include path in addition to filename
+         """
+         files = []
+         try:
+             file_data = os.listdir(folder_path)
+             files = self.list_iterator(
+                 file_data,
+                 folder_path,
+                 limit=limit,
+                 offset=offset,
+                 size_limit=size_limit,
+                 return_full_path=return_full_path,
+                 skip_common_hidden=skip_common_hidden
+             )
+
+         except Exception as e:
+             logger.error("Unable to list objects: {0}".format(e))
+         return files
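
For reference, a minimal usage sketch of the generators above (the bucket name,
prefix, credentials, and local path are placeholders for illustration, not values
shipped with the package):

    from pypeline.generators import S3KeyGenerator, LocalKeyGenerator

    # List up to 10 keys under a prefix, skipping objects larger than 5 MB.
    s3_gen = S3KeyGenerator('MY_ACCESS_KEY_ID', 'MY_SECRET_ACCESS_KEY',
                            aws_region='us-east-1')
    keys = s3_gen.list_files('my-bucket', folder='reports/', limit=10,
                             size_limit=5 * 1024 * 1024)

    # The same offset/limit filtering applied to a local directory.
    local_gen = LocalKeyGenerator()
    paths = local_gen.list_files('/tmp/reports', offset=5, limit=10)
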
File without changes
@@ -0,0 +1,159 @@
+ """ A *local* configuration server to serve local pipeline and schedule configs.
+
+ This should be used for development purposes only.
+
+ Note: `deployment_id` is unused in all endpoints but is included in this
+ development server for full compatibility with Sermos managed deployments.
+ """
+ import json
+ import yaml
+ import logging
+ from typing import Union
+ from marshmallow.exceptions import ValidationError
+ from flask import Flask, request, jsonify
+ from rho_web.response import abort
+ from pypeline.schedule_config_schema import BaseScheduleSchema
+
+ logger = logging.getLogger(__name__)
+ api = Flask(__name__)
+ PREFIX = '/api/v1'
+
+
+ def set_api_config(base_dir: str = None,
+                    pipelines_yaml: str = None,
+                    schedules_json: str = None):
+     """ Establish baseline api configuration (where to find config files)
+     """
+     api.config.update(
+         BASE_DIR=base_dir if base_dir else 'dev',
+         PIPELINES_YAML=pipelines_yaml if pipelines_yaml else 'pipelines.yaml',
+         SCHEDULES_JSON=schedules_json if schedules_json else 'schedules.json')
+
+
+ set_api_config()  # Set by default, can overload manually before starting
+
+
+ def _retrieve_schedules() -> Union[dict, None]:
+     """ Load local schedules.json file
+     """
+     filename = api.config['BASE_DIR'] + '/' + api.config['SCHEDULES_JSON']
+     with open(filename, 'r') as f:
+         schedules = json.loads(f.read())
+     return schedules
+
+
+ def _save_schedules(schedules: dict) -> None:
+     """ Save local schedules.json file
+     """
+     filename = api.config['BASE_DIR'] + '/' + api.config['SCHEDULES_JSON']
+     with open(filename, 'w') as f:
+         f.write(json.dumps(schedules))
+
+
+ def _retrieve_pipelines() -> Union[dict, None]:
+     """ Load the local pipelines.yaml file with all pipeline configurations.
+     """
+     filename = api.config['BASE_DIR'] + '/' + api.config['PIPELINES_YAML']
+     with open(filename, 'r') as f:
+         pipelines = yaml.safe_load(f.read())
+     return pipelines
+
+
+ def _retrieve_pipeline(pipeline_id: str) -> Union[dict, None]:
+     """ Load local pipelines.yaml and load a specific pipeline configuration.
+     """
+     pipelines = _retrieve_pipelines()
+     for pipeline in pipelines['pipelines']:
+         if pipeline['metadata']['pipelineId'] == pipeline_id:
+             return pipeline
+     return None
+
+
+ @api.route(PREFIX + '/deployments/<string:deployment_id>/schedule_tasks',
+            methods=['GET'])
+ def get_schedules(deployment_id: str):
+     """ Load local schedules.json file.
+     """
+     logger.debug(f"Retrieving schedules for {deployment_id}")
+
+     schedules = _retrieve_schedules()
+     try:
+         BaseScheduleSchema().load(schedules)
+     except Exception as e:
+         logger.error(f"Error retrieving schedules: {e}")
+         abort(400, message=str(e))
+     return jsonify(schedules)
+
+
+ @api.route(PREFIX + '/deployments/<string:deployment_id>/schedule_tasks/'
+            '<string:task_id>',
+            methods=['POST'])
+ def update_schedules(deployment_id: str, task_id: str):
+     """ Update local schedules.json file with values from provided schedule.
+
+     Primarily this is intended to keep the last_run_at and total_run_count
+     values up to date.
+     """
+     logger.debug(f"Updating schedules for {deployment_id}")
+
+     new_schedules = json.loads(request.data)  # Schedules with updates
+     schedules = _retrieve_schedules()  # Schedules known to Sermos
+     update_vars = ('last_run_at', 'total_run_count')
+     for s in schedules['schedules']:
+         for new_s in new_schedules['schedules']:
+             if s['name'] == new_s['name']:
+                 for var in update_vars:
+                     s[var] = new_s[var]
+     try:
+         scs = BaseScheduleSchema()
+         scs.load(schedules)  # Validate new schedule
+     except ValidationError:
+         abort(400, message="Invalid new schedule ...")
+
+     _save_schedules(schedules)
+
+     return jsonify({'message': 'Schedules updated ...'})
+
+
+ @api.route(PREFIX + '/deployments/<string:deployment_id>/pipelines',
+            methods=['GET'])
+ def get_pipelines(deployment_id: str):
+     """ Load local pipelines.yaml file
+     """
+     logger.debug(f"Retrieving pipelines for {deployment_id}")
+
+     pipelines = _retrieve_pipelines()
+
+     # Transform into what we expect from the Cloud API server. The local
+     # pipelines.yaml file format is for your own development and reference
+     # if you choose to deploy independently.
+     retval = []
+     for pipeline in pipelines['pipelines']:
+         retval.append(pipeline)
+     return jsonify({'data': {'results': retval}})
+
+
+ @api.route(PREFIX + '/pipelines/<string:deployment_id>/<string:pipeline_id>',
+            methods=['GET'])
+ def get_pipeline(deployment_id: str, pipeline_id: str):
+     """ Load local pipelines.yaml and retrieve a specific pipeline.
+     """
+     logger.debug(f"Retrieving pipeline for {deployment_id} / {pipeline_id}")
+
+     pipeline = _retrieve_pipeline(pipeline_id)
+
+     if pipeline is not None:
+         return jsonify({'data': pipeline})
+     return jsonify({}), 404
+
+
+ @api.route(PREFIX + '/auth', methods=['POST'])
+ def validate_access_key():
+     """ Validate a provided API key.
+
+     NOTE: This is a *mock* endpoint, no actual validation occurs.
+     """
+     access_key = request.headers.get('accesskey', None)
+     if access_key is None:
+         abort(401)
+     return jsonify({})
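
For local development, this module can be run as a small Flask app. A minimal
sketch follows; the import path is an assumption, since this diff does not show
the file's name within the package, so adjust it to wherever the module lives:

    from pypeline.pipeline_config_api import api, set_api_config  # path assumed

    # Point the server at local config files before starting it.
    set_api_config(base_dir='dev',
                   pipelines_yaml='pipelines.yaml',
                   schedules_json='schedules.json')
    api.run(port=5000, debug=True)

    # e.g. GET http://localhost:5000/api/v1/deployments/local/pipelines
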
@@ -0,0 +1,171 @@
+ import os
+ import logging
+ import logging.config
+ from pypeline import __version__
+ from logging import StreamHandler
+
+ logging_set = False
+
+
+ def get_log_level(level: str = None) -> int:
+     """ Attempt to get the log level from the environment, otherwise use the
+     default INFO level. The environment variable LOG_LEVEL should be e.g.
+     'DEBUG'
+     """
+     if level is not None:
+         level_str = str(level)
+     else:
+         level_str = os.environ.get('LOG_LEVEL', 'INFO')
+     return getattr(logging, level_str.upper(), logging.INFO)
+
+
+ def get_log_format(type: str = 'standard',
+                    app_version: str = None,
+                    client_version: str = None):
+     """ Standard log format. Supports `standard` and `simple`
+     """
+     if app_version is None:
+         app_version = "?"
+     if client_version is None:
+         client_version = "?"
+
+     format = '%(message)s'
+     if type == 'standard':
+         format = '%(process)d - %(levelname)s - %(asctime)s - '\
+             + '%(filename)s (%(lineno)d) - '\
+             + 'sermos v{} - client v{} - %(message)s'\
+             .format(app_version, client_version)
+     elif type == 'simple':
+         format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+     return format
+
+
+ def get_date_format():
+     """ Standard date format
+     """
+     return '%Y-%m-%dT%H:%M:%S'
+
+
+ def setup_logging(app_version: str = None,
+                   client_version: str = None,
+                   default_level: str = None,
+                   overload_elasticsearch: bool = False,
+                   establish_logging_config: bool = True):
+     """ Set up logging configuration for standard streaming output + optional
+     log aggregator.
+
+     Standard usage is to invoke this at application bootstrapping time
+     to establish default log handling, e.g.
+
+         def create_app():
+             setup_logging()
+
+     Individual application modules should load a logger like normal:
+         import logging
+         logger = logging.getLogger(__name__)
+
+     elasticsearch-py is overly verbose with its 'info' logging. This
+     will set that logger to `warning` if `overload_elasticsearch` is True.
+
+     `establish_logging_config` is intended to be used by something invoking
+     setup_logging() explicitly with the intention of setting the final
+     configuration, which is the default behavior. Set this to `False` in the
+     case where you might not be sure if logging has been set up yet.
+     """
+     global logging_set
+
+     if logging_set and not establish_logging_config:
+         return
+
+     if establish_logging_config or not logging_set:
+         logging_set = True
+
+     # Set our application version values, which can be passed to this method.
+     # By default, we report the app versions for sermos and the client.
+     A_VERSION = __version__  # sermos version
+     CA_VERSION = None  # application version of client app using sermos
+     if app_version is not None:
+         A_VERSION = app_version
+     if client_version is not None:
+         CA_VERSION = client_version
+
+     log_level = get_log_level(default_level)
+
+     config = {
+         'disable_existing_loggers': False,
+         'version': 1,
+         'formatters': {
+             'simple': {
+                 'format':
+                     get_log_format(type='simple',
+                                    app_version=A_VERSION,
+                                    client_version=CA_VERSION),
+                 'datefmt':
+                     get_date_format()
+             },
+             'standard': {
+                 'format':
+                     get_log_format(type='standard',
+                                    app_version=A_VERSION,
+                                    client_version=CA_VERSION),
+                 'datefmt':
+                     get_date_format()
+             },
+         },
+         'handlers': {
+             'consoleFull': {
+                 'level': 'DEBUG',
+                 'formatter': 'standard',
+                 'class': 'logging.StreamHandler',
+                 'stream': 'ext://sys.stdout'
+             },
+         },
+         'loggers': {
+             '': {
+                 'handlers': ['consoleFull'],
+                 'level': 'ERROR',
+             },
+             'sermos': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+             'timing': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+             'celery': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+             'bin': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+         },
+         'root': {
+             'level': 'DEBUG',
+             'handlers': ['consoleFull']
+         }
+     }
+
+     for handler, handler_config in config['handlers'].items():
+         # Override this handler's level to the level passed to this method
+         handler_config['level'] = log_level
+         config['handlers'][handler] = handler_config
+
+     # Set the root handler's level
+     config['root']['level'] = log_level
+
+     logging.config.dictConfig(config)
+
+     es_logger = logging.getLogger('elasticsearch')
+     if overload_elasticsearch is True:
+         es_logger.setLevel(logging.WARNING)
+     else:
+         # Reset to the baseline level in case this is invoked multiple times.
+         es_logger.setLevel(logging.INFO)
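
A minimal sketch of bootstrapping an application with this logging setup (the
import path is an assumption, since this diff does not show the file's name
within the package):

    import logging
    from pypeline.logging_config import setup_logging  # path assumed

    setup_logging(client_version='0.1.0', default_level='DEBUG',
                  overload_elasticsearch=True)
    logger = logging.getLogger(__name__)
    logger.info("Logging configured ...")
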