scalable-pypeline 1.2.3__py2.py3-none-any.whl → 2.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. pypeline/__init__.py +1 -1
  2. pypeline/barrier.py +34 -0
  3. pypeline/composition.py +349 -0
  4. pypeline/constants.py +51 -84
  5. pypeline/dramatiq.py +470 -0
  6. pypeline/extensions.py +9 -8
  7. pypeline/flask/__init__.py +3 -5
  8. pypeline/flask/api/pipelines.py +109 -148
  9. pypeline/flask/api/schedules.py +14 -39
  10. pypeline/flask/decorators.py +18 -53
  11. pypeline/flask/flask_pypeline.py +156 -0
  12. pypeline/middleware.py +61 -0
  13. pypeline/pipeline_config_schema.py +105 -92
  14. pypeline/pypeline_yaml.py +458 -0
  15. pypeline/schedule_config_schema.py +35 -120
  16. pypeline/utils/config_utils.py +52 -310
  17. pypeline/utils/module_utils.py +35 -71
  18. pypeline/utils/pipeline_utils.py +161 -0
  19. scalable_pypeline-2.0.2.dist-info/METADATA +217 -0
  20. scalable_pypeline-2.0.2.dist-info/RECORD +27 -0
  21. scalable_pypeline-2.0.2.dist-info/entry_points.txt +3 -0
  22. tests/fixtures/__init__.py +0 -1
  23. pypeline/celery.py +0 -206
  24. pypeline/celery_beat.py +0 -254
  25. pypeline/flask/api/utils.py +0 -35
  26. pypeline/flask/flask_sermos.py +0 -156
  27. pypeline/generators.py +0 -196
  28. pypeline/logging_config.py +0 -171
  29. pypeline/pipeline/__init__.py +0 -0
  30. pypeline/pipeline/chained_task.py +0 -70
  31. pypeline/pipeline/generator.py +0 -254
  32. pypeline/sermos_yaml.py +0 -442
  33. pypeline/utils/graph_utils.py +0 -144
  34. pypeline/utils/task_utils.py +0 -552
  35. scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
  36. scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
  37. scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
  38. tests/fixtures/s3_fixtures.py +0 -52
  39. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/LICENSE +0 -0
  40. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/WHEEL +0 -0
  41. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/top_level.txt +0 -0
pypeline/celery_beat.py DELETED
@@ -1,254 +0,0 @@
1
- """ Custom Sermos Scheduler and Celery Entry classes used for dynamic beat.
2
- """
3
- import datetime
4
- import os
5
- import logging
6
- from rhodb.redis_conf import RedisConnector
7
- from celery.beat import Scheduler, ScheduleEntry
8
- from celery import current_app
9
- from celery.utils.time import is_naive
10
- from celery.schedules import schedule as c_schedule, crontab as c_crontab
11
- from pypeline.utils.config_utils import retrieve_latest_schedule_config, \
12
- update_schedule_config
13
- from pypeline.constants import CONFIG_REFRESH_RATE, SCHEDULE_DATE_FORMAT, \
14
- USING_SERMOS_CLOUD
15
-
16
- logger = logging.getLogger(__name__)
17
- redis_conn = RedisConnector().get_connection()
18
-
19
-
20
def convert_to_datetime(
        datetime_str: str,
        datetime_format: str = SCHEDULE_DATE_FORMAT) -> datetime.datetime:
    """ Parse a timestamp string into a ``datetime`` object.

    Args:
        datetime_str: Text representation of the timestamp.
        datetime_format: ``strptime`` format used for parsing; defaults to
            ``SCHEDULE_DATE_FORMAT``.

    Returns:
        The ``datetime.datetime`` produced by ``strptime``.
    """
    parse = datetime.datetime.strptime
    parsed = parse(datetime_str, datetime_format)
    return parsed
26
-
27
-
28
def instantiate_celery_schedule(schedule_entry: dict) -> c_schedule:
    """ Build the celery schedule object described by a schedule entry.

    Args:
        schedule_entry: Entry whose ``config`` mapping holds a
            ``scheduleType`` (``'interval'`` or ``'crontab'``) and the
            matching ``schedule`` mapping.

    Returns:
        A celery ``schedule`` for interval entries, or a celery ``crontab``
        for crontab entries.

    Raises:
        ValueError: If ``scheduleType`` is neither ``'interval'`` nor
            ``'crontab'``.
    """
    config = schedule_entry['config']
    schedule_type = config['scheduleType']

    if schedule_type == 'interval':
        spec = config['schedule']
        # `period` is used as the timedelta keyword and `every` as its value.
        the_delta = datetime.timedelta(**{spec['period']: spec['every']})
        return c_schedule(run_every=the_delta)

    if schedule_type == 'crontab':
        spec = config['schedule']
        return c_crontab(minute=spec['minute'],
                         hour=spec['hour'],
                         day_of_week=spec['dayOfWeek'],
                         day_of_month=spec['dayOfMonth'],
                         month_of_year=spec['monthOfYear'])

    # The 'schedule' mapping is deliberately not touched for unknown types.
    raise ValueError(f"Unsupported scheduleType ({schedule_type}) ...")
51
-
52
-
53
class SermosEntry(ScheduleEntry):
    """ Create a beat entry with additional functionality for Sermos scheduler.

    https://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html
    """
    def __init__(self, schedule_entry: dict = None, **kwargs):
        """ Initialize from a Sermos schedule entry or from plain kwargs.

        Args:
            schedule_entry: Sermos schedule entry (``name``, ``config`` and
                optionally ``id``, ``args``, ``kwargs``, ``lastRunAt``,
                ``totalRunCount``). The dict is read but never modified.
            **kwargs: Used only when ``schedule_entry`` is empty; forwarded
                to ``ScheduleEntry.__init__`` (minus ``sermos_id`` when
                ``USING_SERMOS_CLOUD`` is set).
        """
        if schedule_entry:
            # This event is being instantiated directly with the Sermos
            # schedule entry
            celery_schedule = instantiate_celery_schedule(schedule_entry)
            config = schedule_entry['config']

            # celery.beat.ScheduleEntry expects these keys in a dictionary
            # called `options`. See
            # https://docs.celeryproject.org/en/stable/userguide/calling.html
            # Only keys that are present with a non-None value are passed.
            options = {
                key: config[key]
                for key in ('queue', 'exchange', 'routing_key', 'expires')
                if config.get(key) is not None
            }

            # Resolve the last run time locally. The caller's dict is not
            # written to, so comparisons between successive copies of the
            # schedule config are not polluted by a fresh "now" timestamp.
            last_run_at = schedule_entry.get('lastRunAt')
            if last_run_at is None:
                last_run_at = current_app.now()
            elif isinstance(last_run_at, str):
                last_run_at = convert_to_datetime(last_run_at)

            # Drop tzinfo from aware datetimes; the clock fields are kept.
            if not is_naive(last_run_at):
                last_run_at = last_run_at.replace(tzinfo=None)

            if USING_SERMOS_CLOUD:
                # We need to keep track of the id because this is used to
                # send updates to sermos cloud. The name can't be
                # concatenated with the sermos id or else it will be created
                # as a duplicate celery beat task.
                self.sermos_id = schedule_entry['id']

            super().__init__(app=current_app._get_current_object(),
                             name=schedule_entry['name'],
                             task=config['task'],
                             args=schedule_entry.get('args', None),
                             kwargs=schedule_entry.get('kwargs', None),
                             options=options,
                             schedule=celery_schedule,
                             last_run_at=last_run_at,
                             total_run_count=schedule_entry.get(
                                 'totalRunCount', 0))
        else:
            # This is a task issued directly by celery's scheduler so won't
            # have the schedule_entry argument. Pop the id before
            # initializing the super class and add it back afterwards so we
            # can keep sermos up to date w/ config.
            if USING_SERMOS_CLOUD:
                # NOTE(review): entries built without a `sermos_id` kwarg
                # (presumably celery's own default entries) get None here
                # instead of raising KeyError -- confirm this is acceptable
                # for the cloud update payload.
                sermos_id = kwargs.pop('sermos_id', None)
                super().__init__(**kwargs)
                self.sermos_id = sermos_id
            else:
                super().__init__(**kwargs)

        # Ensure all events have 'event' key - this is populated by ChainedTask
        if 'event' not in self.kwargs:
            self.kwargs['event'] = {}
126
-
127
-
128
class SermosScheduler(Scheduler):
    """ Sermos' implementation of a Celery Scheduler. Leverages a Sermos
    configuration server to provide the up-to-date schedule and provides to
    this scheduler for in-memory tracking.
    """
    Entry = SermosEntry
    _last_refresh = None  # Internal time keeper for Sermos syncing
    _refresh_rate = CONFIG_REFRESH_RATE * 1000000  # Turn to microseconds
    _schedule = None  # Holds latest Celery schedule with only enabled tasks
    _schedule_full = None  # Holds latest schedule, regardless of enabled
    _initial_read = True  # Set to False upon initial bootstrapping

    def __init__(self, *args, **kwargs):
        """ Load the schedule, then initialize the base ``Scheduler``.

        ``max_interval`` is taken from the ``CELERY_BEAT_SYNC_MAX_INTERVAL``
        environment variable, falling back to ``CONFIG_REFRESH_RATE``.
        """
        logger.info("Initializing SermosScheduler ...")
        # This step ensures the latest schedule is pulled from Sermos/cache
        # and bootstraps the local time checker we use.
        self.set_under_schedule()
        self._last_refresh = datetime.datetime.utcnow()

        # Cap the beat sleep interval so our schedule is always forced to be
        # up to date.
        max_interval = int(
            os.environ.get('CELERY_BEAT_SYNC_MAX_INTERVAL',
                           CONFIG_REFRESH_RATE))
        kwargs['max_interval'] = max_interval

        kwargs['schedule'] = self._schedule
        Scheduler.__init__(self, *args, **kwargs)

    def set_under_schedule(self):
        """ Parse the latest schedule config and set self._schedule with parsed
        schedule including only those that are enabled.
        """
        enabled_entries = {}
        full_schedule = []
        # Shallow copies of the previous entries, used for change detection.
        previous_full = [entry.copy() for entry in self._schedule_full
                         ] if self._schedule_full else []
        for sched in retrieve_latest_schedule_config():
            # Append to full list regardless of enabled
            full_schedule.append(sched)
            if sched['enabled']:
                enabled_entries[sched['name']] = SermosEntry(sched)
        self._schedule = enabled_entries
        self._schedule_full = full_schedule

        # Report if schedule changed
        if self._schedule_full != previous_full:
            logger.info("SermosScheduler: Schedule updated ...")
            logger.info(f"SermosScheduler: {self._schedule}")

    def get_current_sermos_schedule(self):
        """ Unpack Celery's current representation of the schedule into Sermos
        format. This is used to send updates back to Sermos related to dynamic
        properties such as last_run_at and total_run_count.

        Returns:
            dict: ``{'schedules': [...]}`` with one ``id`` / ``lastRunAt`` /
            ``totalRunCount`` mapping per entry.
        """
        return {
            'schedules': [{
                'id': entry.sermos_id,
                'lastRunAt': entry.last_run_at.isoformat(),
                'totalRunCount': entry.total_run_count
            } for entry in self.schedule.values()]
        }

    def setup_schedule(self):
        """ Install default entries, then merge in the Sermos schedule. """
        self.install_default_entries(self.data)
        # Overload default behavior and instead bootstrap with our _schedule
        # instead of app.conf.beat_schedule.
        self.merge_inplace(self._schedule)

    def should_refresh(self):
        """ Determine if enough time has elapsed to perform a schedule refresh.

        We work in microseconds so we don't spam external services
        intra-second as most of the time, more than one task exists in the
        schedule and therefore we need to check the scheduler's `schedule`
        on each task very rapidly when issuing tasks.

        Returns:
            bool: True when more than ``_refresh_rate`` microseconds have
            passed since the last refresh; the refresh clock is reset as a
            side effect in that case.
        """
        now = datetime.datetime.utcnow()
        # total_seconds() covers days, seconds and microseconds. Joining
        # `.seconds` and `.microseconds` as text mis-scales the fraction
        # (5 microseconds would read as 0.5 seconds) and ignores `.days`.
        elapsed = now - self._last_refresh
        microseconds_since_last_refresh = elapsed.total_seconds() * 1000000
        res = microseconds_since_last_refresh > self._refresh_rate
        if res:
            self._last_refresh = now - datetime.timedelta(milliseconds=1)
        return res

    def sync(self):
        """ Sync local schedule with Sermos and update Celery's representation
        TODO check this vis-a-vis local vs cloud
        """
        if self.schedule and USING_SERMOS_CLOUD:
            update_schedule_config(self.get_current_sermos_schedule())
        self.set_under_schedule()  # Internal representation
        self.merge_inplace(self._schedule)  # Celery representation

    def get_schedule(self):
        """ Overload default Scheduler get_schedule method to check for updates

        Note: Celery uses a property function, e.g.:
        https://www.tutorialsteacher.com/python/property-function
        for getting/setting the schedule internally. We only override the
        get_schedule method here.
        """
        update = False
        if self._initial_read:
            logger.info('SermosScheduler: Initial read ...')
            update = True
            # Flip the flag before syncing: sync() reads self.schedule, which
            # re-enters this method.
            self._initial_read = False
        elif self.should_refresh():
            logger.info('SermosScheduler: Refreshing schedule ...')
            update = True

        if update:
            self.sync()

        return self._schedule

    def set_schedule(self, schedule):
        """ Redefine Celery set_schedule method
        """
        self.data = schedule

    # Redefine Celery schedule property()
    schedule = property(get_schedule, set_schedule)
@@ -1,35 +0,0 @@
1
- """ Utilities for Sermos APIs and interacting with Pipelines/Schedules
2
- """
3
- import logging
4
- from typing import Union
5
-
6
- from pypeline.pipeline.generator import PipelineGenerator
7
-
8
- logger = logging.getLogger(__name__)
9
-
10
-
11
def chain_helper(pipeline_id: str,
                 access_key: Union[str, None] = None,
                 chain_payload: Union[dict, None] = None,
                 queue: Union[str, None] = None,
                 default_task_ttl: int = None):
    """ Build a ``PipelineGenerator`` for a pipeline and, when it reports
    ``good_to_go``, generate its chain.

    Usage:
        my_chain = chain_helper('pipeline-name')
        my_chain.delay()

    Returns:
        The ``PipelineGenerator`` instance, whether or not a chain was
        generated.
    """
    # All optional settings are handed straight to the generator.
    generator_options = {
        'access_key': access_key,
        'queue': queue,
        'default_task_ttl': default_task_ttl,
        'chain_payload': chain_payload,
    }
    pipeline_gen = PipelineGenerator(pipeline_id, **generator_options)

    if not pipeline_gen.good_to_go:
        return pipeline_gen

    # Generate the 'chain': the grouping of celery constructs that lets the
    # dag run according to the adjacency list in the pipeline configuration.
    pipeline_gen.generate_chain()
    return pipeline_gen
@@ -1,156 +0,0 @@
1
- """ Sermos implementation as a Flask extension
2
- """
3
- import os
4
- if os.getenv('USE_GEVENT', 'false').lower() == 'true':
5
- import gevent.monkey
6
- gevent.monkey.patch_all()
7
-
8
- import logging
9
- from flask import Flask
10
- from werkzeug.middleware.proxy_fix import ProxyFix
11
- from flask_smorest import Api, Blueprint
12
- from pypeline.extensions import sermos_config
13
- from pypeline.constants import DEFAULT_OPENAPI_CONFIG
14
- from pypeline import __version__
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
class FlaskSermos:
    """ Sermos Flask extension.
    """
    def __init__(self, app: Flask = None):
        """ Store the app and Sermos config; initialize the app if given.

        Args:
            app: Optional Flask application. When provided, ``init_app`` is
                called immediately (without API bootstrapping).
        """
        self.app = app
        self.sermos_config = sermos_config if sermos_config is not None else {}

        if app is not None:
            self.init_app(app)

    def init_app(self, app: Flask, init_api: bool = False):
        """ Sermos bootstrapping process.

        Application config variables to set include:

            SERMOS_CLIENT_VERSION (default: v?.?.?)
            SERMOS_HIJACK_ROOT_LOGGER (default: False; not read by this
                class -- presumably consumed elsewhere)

        Optional, if `init_api` is True:

            API_DOCUMENTATION_TITLE
            API_DOCUMENTATION_DESCRIPTION
            OPENAPI_VERSION
            OPENAPI_URL_PREFIX
            OPENAPI_SWAGGER_APP_NAME
            OPENAPI_SWAGGER_UI_PATH
            OPENAPI_SWAGGER_BASE_TEMPLATE
            OPENAPI_SWAGGER_URL
            OPENAPI_SWAGGER_UI_URL
            SWAGGER_UI_DOC_EXPANSION
            EXPLAIN_TEMPLATE_LOADING

        Args:
            app (Flask): Flask Application to initialize.
            init_api (bool): If `True`, Sermos will initialize its
                core APIs (including Pipelines, Scheduled Tasks, etc.) and
                provide a pre-configured OpenAPI Spec/Swagger UI interface
                available at the route defined in your application's config
                under `OPENAPI_URL_PREFIX` (default `/api`). Refer to
                [flask-smorest](https://flask-smorest.readthedocs.io/en/latest/openapi.html)
                documentation for additional configuration options.
        """
        # Ensure there's a SERMOS_CLIENT_VERSION on app config
        app.config.setdefault('SERMOS_CLIENT_VERSION', 'v?.?.?')

        app.wsgi_app = ProxyFix(app.wsgi_app)
        app.url_map.strict_slashes = False

        # Create and register the sermos blueprint
        bp = Blueprint('sermos',
                       __name__,
                       template_folder='../templates',
                       static_folder='../static',
                       url_prefix='/sermos')
        app.register_blueprint(bp)

        # Bootstrap api if app requests
        if init_api is True:
            self._bootstrap_api(app)

    def _bootstrap_api(self, app: Flask):
        """ If initializing the API, we will create the core Sermos API paths
        and initialize the default Swagger documentation.

        The resulting ``Api`` object (or ``None`` if initialization failed)
        is stored under ``app.extensions['sermos_core_api']`` unless that key
        is already set.
        """
        # Set sensible defaults for Swagger docs. Values already present on
        # the provided `app` take precedence.
        for swagger_config in DEFAULT_OPENAPI_CONFIG:
            app.config.setdefault(swagger_config[0], swagger_config[1])

        # Attempt to override with values from client's sermos.yaml if
        # they are available. This will add new tags and new docs if
        # defined and add to the core Sermos API docs.
        api_config = self.sermos_config.get('apiConfig', {})
        api_docs = api_config.get('apiDocumentation', {})

        custom_tags = api_config.get('prefixDescriptions', [])

        # Only non-None values from the yaml replace the app's own config.
        overrides = (
            ('SERMOS_CLIENT_VERSION', 'version'),
            ('API_DOCUMENTATION_TITLE', 'title'),
            ('API_DOCUMENTATION_DESCRIPTION', 'description'),
        )
        for config_key, docs_key in overrides:
            override = api_docs.get(docs_key, None)
            if override is not None:
                app.config[config_key] = override

        # Set default Sermos Tags along with custom tags from sermos.yaml
        tags = [{
            'name': 'Pipelines',
            'description': 'Operations related to Pipelines'
        }, {
            'name': 'Schedules',
            'description': 'Operations related to Schedules'
        }] + custom_tags

        # Set up the initializing spec kwargs for API
        spec_kwargs = {
            'title': app.config['API_DOCUMENTATION_TITLE'],
            'version': f"Sermos: {__version__} - "
            f"Client: {app.config['SERMOS_CLIENT_VERSION']}",
            'description': app.config['API_DOCUMENTATION_DESCRIPTION'],
            'tags': tags
        }
        try:
            api = Api()
            api.init_app(app, spec_kwargs=spec_kwargs)

            # Register available Sermos API Namespaces
            self._register_api_namespaces(api)

        except Exception as e:
            # Best effort: the app keeps working without the core API. Use
            # the module logger so the record carries this module's name.
            api = None
            logger.exception(f"Unable to initialize API ... {e}")

        # Register the Sermos Core API as an extension for use in Client App
        app.extensions.setdefault('sermos_core_api', api)

    @staticmethod
    def _register_api_namespaces(api: Api):
        """ Register Default API namespaces
        TODO add metrics APIs
        """
        # Imported here rather than at module import time.
        from pypeline.flask.api.pipelines import bp as pipelinesbp
        api.register_blueprint(pipelinesbp)
        from pypeline.flask.api.schedules import bp as schedulesbp
        api.register_blueprint(schedulesbp)
pypeline/generators.py DELETED
@@ -1,196 +0,0 @@
1
- import os
2
- from boto3 import Session
3
- import logging
4
-
5
- logger = logging.getLogger(__name__)
6
-
7
-
8
class KeyGenerator(object):
    """ Common functions for key generators.
    """
    def __init__(self):
        super(KeyGenerator, self).__init__()
        # File names skipped when `skip_common_hidden` is requested.
        self.hidden_files = ('.DS_Store', '.git', 'Icon', '.Dropbox')

    def get_file_key(self, file_obj):
        """ Required for each specific generator - how to extract key
        """
        return file_obj

    def get_file_name(self, file_obj):
        """ Required for each specific generator - how to extract file name
        """
        return file_obj

    def get_file_size(self, base_path, file_obj):
        """ Required for each specific generator - how to find file size (BYTES)
        """
        return 0

    def get_final_path(self, base_path, file_name, return_full_path):
        """ Required for each specific generator - create final file path that
        is added to list.
        """
        if return_full_path:
            return os.path.normpath(base_path + '/' + file_name)

        return file_name

    def list_iterator(self, all_files, base_path, limit=None, offset=None,
                      size_limit=None, return_full_path=True,
                      skip_common_hidden=True):
        """ Filter an iterable of file objects by offset, limit and size.

        Args:
            all_files: Iterable of file objects understood by the
                ``get_file_*`` methods.
            base_path: Prefix passed to ``get_file_size`` / ``get_final_path``.
            limit: Maximum number of entries to return; falsy means no limit.
            offset: Number of (non-hidden) entries to skip before listing;
                falsy means none.
            size_limit: Skip files whose size in bytes exceeds this value.
            return_full_path: Passed through to ``get_final_path``.
            skip_common_hidden: Skip names listed in ``self.hidden_files``.

        Returns:
            list: The selected paths. On any error the problem is logged and
            whatever was collected so far is returned.
        """
        filtered_files = []
        try:
            max_files = int(limit) if limit else None
            first_index = int(offset) if offset else 0

            idx = -1
            for f in all_files:
                # Stop consuming input as soon as the limit is satisfied.
                if max_files is not None and len(filtered_files) >= max_files:
                    break

                this_key = self.get_file_key(f)
                this_filename = self.get_file_name(f)

                # Hidden files do not count towards the offset.
                if skip_common_hidden and this_filename in self.hidden_files:
                    continue

                idx += 1
                if idx < first_index:
                    continue

                # Verify filesize. Having some issues with large PDFs (process
                # simply killed). So allow option of skipping files above a
                # certain size in bytes.
                if size_limit is not None:
                    if self.get_file_size(base_path, f) > size_limit:
                        continue

                filtered_files.append(
                    self.get_final_path(base_path, this_key, return_full_path)
                )
        except Exception as e:
            logger.error("Unable to list objects: {0}".format(e))

        return filtered_files
81
-
82
-
83
class S3KeyGenerator(KeyGenerator):
    """ Produce a list of object keys from S3.
    """
    def __init__(self, aws_access_key_id, aws_secret_access_key,
                 aws_region='us-east-1'):
        super(S3KeyGenerator, self).__init__()

        session = Session(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region
        )
        self.s3 = session.client('s3')

    def get_file_key(self, file_obj):
        """ Get file key from s3 object """
        return file_obj.get('Key', None)

    def get_file_name(self, file_obj):
        """ Get file name (last `/`-separated part of the key) from s3 object;
        None when the object or its key is missing.
        """
        if file_obj is not None:
            key = file_obj.get('Key', None)
            if key is not None:
                return key.split('/')[-1]
        return None

    def get_file_size(self, base_path, file_obj):
        """ Return file size of s3 object """
        return file_obj.get('Size', 0)

    # All files in bucket
    # Range of files with an offset
    def list_files(self, bucket, folder='', limit=None, offset=None,
                   size_limit=None, return_full_path=True,
                   skip_common_hidden=True):
        """ Lists files inside an S3 bucket+folder

        Note: This does not guarantee any sort of order. Boto+S3 does not
            provide an interface for sorting results, so that would need
            to happen in memory.

        Note: A single `list_objects_v2` call is made, so only the first
            page of results is considered (per the boto3 documentation a
            page holds at most 1,000 keys).

        limit will include a maximum of 'limit' values
        offset will start including values only after 'offset' keys
        size_limit will not include files over a specific size (in bytes)
        skip_common_hidden will exclude common hidden files
        return_full_path will include 'bucket/' in key.
        """
        files = []

        try:
            file_data = self.s3.list_objects_v2(
                Bucket=bucket, Delimiter='/', Prefix=folder)
            # 'Contents' is absent when nothing matches the prefix; that is
            # an empty listing, not an error.
            files = self.list_iterator(
                file_data.get('Contents', []),
                bucket,
                limit=limit,
                offset=offset,
                size_limit=size_limit,
                return_full_path=return_full_path,
                skip_common_hidden=skip_common_hidden
            )

        except Exception as e:
            logger.error("Unable to list objects: {0}".format(e))
        return files
148
-
149
-
150
class LocalKeyGenerator(KeyGenerator):
    """ Key generator that lists file names found in a filesystem folder.
    """
    def __init__(self):
        super().__init__()

    def get_file_key(self, file_obj):
        """ The key of a local file is the object itself. """
        return file_obj

    def get_file_name(self, file_obj):
        """ The name of a local file is the object itself. """
        return file_obj

    def get_file_size(self, base_path, file_obj):
        """ Size reported by ``os.stat`` for ``base_path/file_obj``; logs an
        error and yields 0 when the stat call fails.
        """
        full_path = os.path.normpath(base_path + '/' + file_obj)
        try:
            size = os.stat(full_path).st_size
        except Exception:
            logger.error("File {0} not found ...".format(full_path))
            size = 0
        return size

    def list_files(self, folder_path, limit=None, offset=None,
                   size_limit=None, return_full_path=True,
                   skip_common_hidden=True):
        """ Lists all file names inside a path.

        skip_common_hidden will exclude common hidden files
        return_full_path will include path in addition to filename

        An empty list is returned (and the error logged) if listing fails.
        """
        filter_options = {
            'limit': limit,
            'offset': offset,
            'size_limit': size_limit,
            'return_full_path': return_full_path,
            'skip_common_hidden': skip_common_hidden,
        }
        try:
            entries = os.listdir(folder_path)
            return self.list_iterator(entries, folder_path, **filter_options)
        except Exception as e:
            logger.error("Unable to list objects: {0}".format(e))
            return []