onetick_py-1.177.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- locator_parser/__init__.py +0 -0
- locator_parser/acl.py +73 -0
- locator_parser/actions.py +262 -0
- locator_parser/common.py +368 -0
- locator_parser/io.py +43 -0
- locator_parser/locator.py +150 -0
- onetick/__init__.py +101 -0
- onetick/doc_utilities/__init__.py +3 -0
- onetick/doc_utilities/napoleon.py +40 -0
- onetick/doc_utilities/ot_doctest.py +140 -0
- onetick/doc_utilities/snippets.py +279 -0
- onetick/lib/__init__.py +4 -0
- onetick/lib/instance.py +141 -0
- onetick/py/__init__.py +293 -0
- onetick/py/_stack_info.py +89 -0
- onetick/py/_version.py +2 -0
- onetick/py/aggregations/__init__.py +11 -0
- onetick/py/aggregations/_base.py +648 -0
- onetick/py/aggregations/_docs.py +948 -0
- onetick/py/aggregations/compute.py +286 -0
- onetick/py/aggregations/functions.py +2216 -0
- onetick/py/aggregations/generic.py +104 -0
- onetick/py/aggregations/high_low.py +80 -0
- onetick/py/aggregations/num_distinct.py +83 -0
- onetick/py/aggregations/order_book.py +501 -0
- onetick/py/aggregations/other.py +1014 -0
- onetick/py/backports.py +26 -0
- onetick/py/cache.py +374 -0
- onetick/py/callback/__init__.py +5 -0
- onetick/py/callback/callback.py +276 -0
- onetick/py/callback/callbacks.py +131 -0
- onetick/py/compatibility.py +798 -0
- onetick/py/configuration.py +771 -0
- onetick/py/core/__init__.py +0 -0
- onetick/py/core/_csv_inspector.py +93 -0
- onetick/py/core/_internal/__init__.py +0 -0
- onetick/py/core/_internal/_manually_bound_value.py +6 -0
- onetick/py/core/_internal/_nodes_history.py +250 -0
- onetick/py/core/_internal/_op_utils/__init__.py +0 -0
- onetick/py/core/_internal/_op_utils/every_operand.py +9 -0
- onetick/py/core/_internal/_op_utils/is_const.py +10 -0
- onetick/py/core/_internal/_per_tick_scripts/tick_list_sort_template.script +121 -0
- onetick/py/core/_internal/_proxy_node.py +140 -0
- onetick/py/core/_internal/_state_objects.py +2312 -0
- onetick/py/core/_internal/_state_vars.py +93 -0
- onetick/py/core/_source/__init__.py +0 -0
- onetick/py/core/_source/_symbol_param.py +95 -0
- onetick/py/core/_source/schema.py +97 -0
- onetick/py/core/_source/source_methods/__init__.py +0 -0
- onetick/py/core/_source/source_methods/aggregations.py +809 -0
- onetick/py/core/_source/source_methods/applyers.py +296 -0
- onetick/py/core/_source/source_methods/columns.py +141 -0
- onetick/py/core/_source/source_methods/data_quality.py +301 -0
- onetick/py/core/_source/source_methods/debugs.py +272 -0
- onetick/py/core/_source/source_methods/drops.py +120 -0
- onetick/py/core/_source/source_methods/fields.py +619 -0
- onetick/py/core/_source/source_methods/filters.py +1002 -0
- onetick/py/core/_source/source_methods/joins.py +1413 -0
- onetick/py/core/_source/source_methods/merges.py +605 -0
- onetick/py/core/_source/source_methods/misc.py +1455 -0
- onetick/py/core/_source/source_methods/pandases.py +155 -0
- onetick/py/core/_source/source_methods/renames.py +356 -0
- onetick/py/core/_source/source_methods/sorts.py +183 -0
- onetick/py/core/_source/source_methods/switches.py +142 -0
- onetick/py/core/_source/source_methods/symbols.py +117 -0
- onetick/py/core/_source/source_methods/times.py +627 -0
- onetick/py/core/_source/source_methods/writes.py +986 -0
- onetick/py/core/_source/symbol.py +205 -0
- onetick/py/core/_source/tmp_otq.py +222 -0
- onetick/py/core/column.py +209 -0
- onetick/py/core/column_operations/__init__.py +0 -0
- onetick/py/core/column_operations/_methods/__init__.py +4 -0
- onetick/py/core/column_operations/_methods/_internal.py +28 -0
- onetick/py/core/column_operations/_methods/conversions.py +216 -0
- onetick/py/core/column_operations/_methods/methods.py +292 -0
- onetick/py/core/column_operations/_methods/op_types.py +160 -0
- onetick/py/core/column_operations/accessors/__init__.py +0 -0
- onetick/py/core/column_operations/accessors/_accessor.py +28 -0
- onetick/py/core/column_operations/accessors/decimal_accessor.py +104 -0
- onetick/py/core/column_operations/accessors/dt_accessor.py +537 -0
- onetick/py/core/column_operations/accessors/float_accessor.py +184 -0
- onetick/py/core/column_operations/accessors/str_accessor.py +1367 -0
- onetick/py/core/column_operations/base.py +1121 -0
- onetick/py/core/cut_builder.py +150 -0
- onetick/py/core/db_constants.py +20 -0
- onetick/py/core/eval_query.py +245 -0
- onetick/py/core/lambda_object.py +441 -0
- onetick/py/core/multi_output_source.py +232 -0
- onetick/py/core/per_tick_script.py +2256 -0
- onetick/py/core/query_inspector.py +464 -0
- onetick/py/core/source.py +1744 -0
- onetick/py/db/__init__.py +2 -0
- onetick/py/db/_inspection.py +1128 -0
- onetick/py/db/db.py +1327 -0
- onetick/py/db/utils.py +64 -0
- onetick/py/docs/__init__.py +0 -0
- onetick/py/docs/docstring_parser.py +112 -0
- onetick/py/docs/utils.py +81 -0
- onetick/py/functions.py +2398 -0
- onetick/py/license.py +190 -0
- onetick/py/log.py +88 -0
- onetick/py/math.py +935 -0
- onetick/py/misc.py +470 -0
- onetick/py/oqd/__init__.py +22 -0
- onetick/py/oqd/eps.py +1195 -0
- onetick/py/oqd/sources.py +325 -0
- onetick/py/otq.py +216 -0
- onetick/py/pyomd_mock.py +47 -0
- onetick/py/run.py +916 -0
- onetick/py/servers.py +173 -0
- onetick/py/session.py +1347 -0
- onetick/py/sources/__init__.py +19 -0
- onetick/py/sources/cache.py +167 -0
- onetick/py/sources/common.py +128 -0
- onetick/py/sources/csv.py +642 -0
- onetick/py/sources/custom.py +85 -0
- onetick/py/sources/data_file.py +305 -0
- onetick/py/sources/data_source.py +1045 -0
- onetick/py/sources/empty.py +94 -0
- onetick/py/sources/odbc.py +337 -0
- onetick/py/sources/order_book.py +271 -0
- onetick/py/sources/parquet.py +168 -0
- onetick/py/sources/pit.py +191 -0
- onetick/py/sources/query.py +495 -0
- onetick/py/sources/snapshots.py +419 -0
- onetick/py/sources/split_query_output_by_symbol.py +198 -0
- onetick/py/sources/symbology_mapping.py +123 -0
- onetick/py/sources/symbols.py +374 -0
- onetick/py/sources/ticks.py +825 -0
- onetick/py/sql.py +70 -0
- onetick/py/state.py +251 -0
- onetick/py/types.py +2131 -0
- onetick/py/utils/__init__.py +70 -0
- onetick/py/utils/acl.py +93 -0
- onetick/py/utils/config.py +186 -0
- onetick/py/utils/default.py +49 -0
- onetick/py/utils/file.py +38 -0
- onetick/py/utils/helpers.py +76 -0
- onetick/py/utils/locator.py +94 -0
- onetick/py/utils/perf.py +498 -0
- onetick/py/utils/query.py +49 -0
- onetick/py/utils/render.py +1374 -0
- onetick/py/utils/script.py +244 -0
- onetick/py/utils/temp.py +471 -0
- onetick/py/utils/types.py +120 -0
- onetick/py/utils/tz.py +84 -0
- onetick_py-1.177.0.dist-info/METADATA +137 -0
- onetick_py-1.177.0.dist-info/RECORD +152 -0
- onetick_py-1.177.0.dist-info/WHEEL +5 -0
- onetick_py-1.177.0.dist-info/entry_points.txt +2 -0
- onetick_py-1.177.0.dist-info/licenses/LICENSE +21 -0
- onetick_py-1.177.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,648 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Union, TYPE_CHECKING, Tuple, Optional, Any
+from copy import deepcopy
+from functools import wraps
+from collections import namedtuple
+import pandas as pd
+
+if TYPE_CHECKING:
+    from onetick.py.core.source import Source  # hack for annotations
+
+from onetick.py.core.column import _Column
+from onetick.py.core.column_operations.base import _Operation, OnetickParameter
+from onetick.py.core._source._symbol_param import _SymbolParamColumn
+from onetick.py import types as ott
+from onetick.py import utils
+from onetick.py.otq import otq
+
+
+def validate(method):
+    """wraps schema getter with validations of input columns + src and resulting schema + output column"""
+
+    @wraps(method)
+    def inner(obj: '_Aggregation', src: 'Source', name):
+        obj.validate_input_columns(src)
+        for column in obj.group_by:
+            if str(column) not in src.schema or not isinstance(src[str(column)], _Column):
+                raise KeyError(f"There is no '{column}' column to group by")
+        schema: Dict = method(obj, src=src, name=name)
+        if not obj.overwrite_output_field:
+            obj.validate_output_name(schema, name)
+        return schema
+
+    return inner
+
+
+def operation_gb(method):
+    """wraps aggregation to apply _Operation and remove it after aggregation"""
+
+    @wraps(method)
+    def inner(obj: '_Aggregation', src: 'Source', *args, **kwargs) -> 'Source':
+        inplace = kwargs.get('inplace')
+        res = src if inplace else src.copy()
+        src_schema = src.schema
+
+        gb_copy = obj.group_by.copy()
+        obj.group_by = []
+        for i, gb in enumerate(gb_copy):
+            if isinstance(gb, _Operation) and not isinstance(gb, _Column):
+                name = f'GROUP_{i}'
+                if name in src_schema:
+                    raise AttributeError(f"'{name}' column name is reserved for group by Operation "
+                                         f"but it exists in current schema")
+                res[name] = gb
+                obj.group_by.append(res[name])
+            else:
+                obj.group_by.append(gb)
+        res = method(obj, res, *args, **kwargs)
+
+        obj.group_by = gb_copy
+        return res
+    return inner
+
+
+def operation_replacer(method):
+    """
+    PY-378
+    Decorator allows working with aggregation's columns specified as operations.
+    """
+
+    @wraps(method)
+    def inner(obj: '_Aggregation', src: 'Source', *args, **kwargs) -> 'Source':
+        inplace = kwargs.get('inplace')
+        res = src if inplace else src.copy()
+        tmp_columns = {}
+
+        aggrs = getattr(obj, 'aggrs', None)
+        if aggrs:
+            aggs = aggrs.values()
+        else:
+            name = args[0] if args else kwargs.get('name')
+            # pylint: disable-next=unidiomatic-typecheck
+            if type(obj.column_name) is _Operation and name is None:
+                raise ValueError('Output field name must be specified when aggregating operation')
+            aggs = [obj]
+
+        # Add operation from each aggregation object to source `res` as column
+        # and replace *column_name* property in each aggregation object with this column's name.
+        for i, agg in enumerate(aggs):
+            # pylint: disable-next=unidiomatic-typecheck
+            if type(agg.column_name) is _Operation:
+                tmp_name = f'__TMP_AGG_COLUMN_{i}__'
+                res[tmp_name] = agg.column_name
+                tmp_columns[tmp_name] = (agg, agg.column_name)
+                agg.column_name = tmp_name
+
+        res = method(obj, res, *args, **kwargs)
+
+        if tmp_columns:
+            # Rollback all aggregation objects and source `res`.
+            # Delete all temporary columns and change property *column_name* back in aggregations.
+            to_drop = list(set(tmp_columns).intersection(res.schema))
+            if to_drop:
+                res.drop(to_drop, inplace=True)
+            for agg, column_name in tmp_columns.values():
+                agg.column_name = column_name
+
+        return res
+    return inner
+
+
+def output_column_overwriter(method):
+    """
+    Allows outputting aggregation to existing field.
+    In this case temporary renaming existing field.
+    """
+    @wraps(method)
+    def inner(obj: '_Aggregation', src: 'Source', *args, **kwargs) -> 'Source':
+        column_name = obj.column_name
+        name = args[0] if args else kwargs.get('name')
+        name = name or column_name
+
+        if not obj.overwrite_output_field or not name or name not in src.schema:
+            return method(obj, src, *args, **kwargs)
+
+        inplace = kwargs.get('inplace')
+        res = src if inplace else src.copy()
+
+        # rename existing field to the temporary name
+        tmp_name = f'__TMP_AGG_COLUMN_{name}__'
+        res[tmp_name] = res[name]
+        res.drop(name, inplace=True)
+        # aggregating renamed field
+        kwargs['name'] = name
+        obj.column_name = tmp_name
+
+        res = method(obj, res, *args, **kwargs)
+
+        # removing temporary field
+        if tmp_name in res.schema:
+            res.drop(tmp_name, inplace=True)
+        obj.column_name = column_name
+
+        return res
+    return inner
+
+
+def get_seconds_from_time_offset(time_offset):
+    if not isinstance(time_offset, ott.OTPBaseTimeOffset):
+        raise ValueError('Only DatePart objects can be passed in this function')
+
+    return int(pd.Timedelta(time_offset).total_seconds())
+
+
+def get_bucket_interval_from_datepart(bucket_interval):
+    if not isinstance(bucket_interval, ott.OTPBaseTimeOffset):
+        raise ValueError('Only DatePart objects can be passed in this function')
+
+    if isinstance(bucket_interval, ott.ExpressionDefinedTimeOffset):
+        raise ValueError(f"Operation as DatePart isn't allowed: {str(bucket_interval.n)}")
+
+    # bucket_interval also could be one of these:
+    # otp.Milli, otp.Second, otp.Minute, otp.Hour, otp.Day, otp.Month
+    # bucket_interval will be converted and corresponding bucket_units value will be set
+
+    offset, datepart = bucket_interval.get_offset()
+    if datepart not in {'millisecond', 'second', 'minute', 'hour', 'day', 'month'}:
+        raise ValueError(f"Unsupported DatePart passed to bucket_interval: {datepart}")
+
+    if offset < 0:
+        raise ValueError(
+            f"Negative DateParts aren't allowed for bucket_interval: {offset} ({datepart})"
+        )
+
+    if datepart in {'millisecond', 'minute', 'hour'}:
+        # bucket_units could be only seconds, days, months or ticks
+        # so other DateParts are converted to seconds
+        if datepart == 'millisecond':
+            offset, datepart = offset / 1000, 'second'
+        else:
+            offset, datepart = ott.Second(get_seconds_from_time_offset(bucket_interval)).get_offset()
+
+    return offset, f"{datepart}s"  # type: ignore[union-attr]
+
+
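The two helpers above normalize DatePart offsets into the (BUCKET_INTERVAL, BUCKET_INTERVAL_UNITS) pair consumed by the `_Aggregation` class below. A minimal sketch of the expected mapping, assuming `otp.Minute`, `otp.Day` and `otp.Milli` are the DatePart classes named in the comment and that `pd.Timedelta` accepts them, as the helper itself does (illustration only, not part of the packaged file):

    from onetick.py.aggregations._base import get_bucket_interval_from_datepart
    import onetick.py as otp  # assumed conventional alias

    print(get_bucket_interval_from_datepart(otp.Minute(5)))   # (300, 'seconds'), via pd.Timedelta
    print(get_bucket_interval_from_datepart(otp.Day(2)))      # (2, 'days'), passed through as-is
    print(get_bucket_interval_from_datepart(otp.Milli(250)))  # (0.25, 'seconds'), milliseconds / 1000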
+class _Aggregation(ABC):
+
+    @property
+    @abstractmethod
+    def NAME(self) -> str:
+        pass
+
+    @property
+    @abstractmethod
+    def EP(self) -> otq.EpBase:
+        pass
+
+    DEFAULT_OUTPUT_NAME = 'VALUE'
+
+    FIELDS_MAPPING = {
+        "column_name": "INPUT_FIELD_NAME",
+        "running": "IS_RUNNING_AGGR",
+        "all_fields": "ALL_FIELDS_FOR_SLIDING",
+        "bucket_interval": "BUCKET_INTERVAL",
+        "bucket_time": "BUCKET_TIME",
+        "bucket_units": "BUCKET_INTERVAL_UNITS",
+        "bucket_end_condition": "BUCKET_END_CRITERIA",
+        "end_condition_per_group": "BUCKET_END_PER_GROUP",
+        "boundary_tick_bucket": "BOUNDARY_TICK_BUCKET",
+        "group_by": "GROUP_BY",
+        "groups_to_display": "GROUPS_TO_DISPLAY",
+    }
+    FIELDS_DEFAULT = {
+        "running": False,
+        "all_fields": False,
+        "bucket_interval": 0,
+        "bucket_time": "BUCKET_END",
+        "bucket_units": "seconds",
+        "bucket_end_condition": None,
+        "end_condition_per_group": False,
+        "boundary_tick_bucket": "new",
+        "group_by": [],
+        "groups_to_display": "all",
+    }
+
+    FIELDS_TO_SKIP: List = []  # attr listed here won't be used in self.__str__
+
+    output_field_type: Optional[type] = None  # None will force to use type of input column
+    require_type: Optional[Tuple[type, ...]] = None
+    _validations_to_skip: List = []
+
+    def __init__(self,
+                 column: Union[str, _Column, _Operation],
+                 running: bool = False,
+                 all_fields: Union[bool, str] = False,
+                 bucket_interval: Union[int, ott.OTPBaseTimeOffset] = 0,
+                 bucket_time: str = "end",
+                 bucket_units: Union[str, None] = None,
+                 bucket_end_condition: Optional[_Operation] = None,
+                 end_condition_per_group: bool = False,
+                 boundary_tick_bucket: str = "new",
+                 group_by: Optional[Union[List, str, _Operation]] = None,
+                 groups_to_display: str = "all",
+                 overwrite_output_field: bool = False):
+        """
+        Abstract method that implements common logic for aggregations
+        """
+        if isinstance(column, list):
+            column = ','.join(map(str, column))
+
+        column_name: Union[str, _Operation] = str(column)
+
+        if column_name == "Time":
+            # TODO: need to understand how to better work with alias
+            column_name = "TIMESTAMP"
+
+        # pylint: disable-next=unidiomatic-typecheck
+        if type(column) is _Operation:
+            column_name = column
+
+        if isinstance(bucket_interval, float):
+            if bucket_units is not None and bucket_units != 'seconds':
+                raise ValueError('Float values for bucket_interval are only supported for seconds.')
+            if bucket_interval < 0.001:
+                raise ValueError('Float values for bucket_interval less than 0.001 are not supported.')
+
+        if isinstance(bucket_interval, ott.OTPBaseTimeOffset):
+            bucket_interval, bucket_units = get_bucket_interval_from_datepart(bucket_interval)
+
+        if isinstance(all_fields, str) and all_fields == "when_ticks_exit_window":
+            if not running:
+                raise ValueError("`all_fields` can't be set to 'when_ticks_exit_window' when `running=False`")
+
+            if not bucket_interval:
+                raise ValueError(
+                    "`all_fields` can't be set to 'when_ticks_exit_window' when `bucket_interval` is zero`"
+                )
+
+            all_fields = all_fields.upper()
+
+        self.column_name = column_name
+        self.running = running
+        self.all_fields = all_fields
+        self.bucket_time = bucket_time
+
+        if isinstance(bucket_interval, _Operation):
+            if bucket_interval.dtype is bool:
+                if bucket_end_condition is not None:
+                    raise ValueError(
+                        "Bucket end condition passed on both `bucket_interval` and `bucket_end_condition` parameters"
+                    )
+
+                bucket_end_condition = bucket_interval
+                bucket_interval = 0
+            elif isinstance(bucket_interval, OnetickParameter) and bucket_interval.dtype is int:
+                bucket_interval = str(bucket_interval)
+            elif isinstance(bucket_interval, _SymbolParamColumn) and bucket_interval.dtype is int:
+                bucket_interval = str(bucket_interval.expr)
+            else:
+                raise ValueError("Bucket interval can only be boolean otp.Operation or integer otp.param")
+
+        self.bucket_interval = bucket_interval
+
+        if bucket_end_condition is None:
+            self.bucket_end_condition = None  # type: ignore
+        else:
+            self.bucket_end_condition = str(bucket_end_condition)
+
+        self.bucket_units = bucket_units
+        if self.bucket_units is None:
+            if self.bucket_end_condition:
+                # allow omitting bucket_units if bucket_end_condition is set
+                self.bucket_units = 'flexible'
+            else:
+                # default value
+                self.bucket_units = 'seconds'
+
+        self.end_condition_per_group = end_condition_per_group
+        self.boundary_tick_bucket = boundary_tick_bucket
+        self.large_ints = False
+        if isinstance(group_by, (_Operation, str)):
+            group_by = [group_by]
+        self.group_by = group_by or []
+        self.groups_to_display = groups_to_display
+        self.overwrite_output_field = overwrite_output_field
+
+        self._param_validation()
+        self.bucket_time = f'BUCKET_{self.bucket_time.upper()}'
+
+    @staticmethod
+    def _attr2str(value) -> str:
+        if isinstance(value, bool):
+            return 'true' if value else 'false'
+        if isinstance(value, list):
+            return ','.join(value)
+        return str(value)
+
+    @property
+    def ep_params(self) -> Dict:
+        """prepare params for self.__str__ and otq.EpBase"""
+        params = {}
+
+        for field, ep_param in self.FIELDS_MAPPING.items():
+            if field in self.FIELDS_TO_SKIP:
+                continue
+
+            default_value = self.FIELDS_DEFAULT.get(field)
+            if getattr(self, field) != default_value:
+                if field == 'group_by':
+                    params[ep_param] = ",".join(list(map(str, self.group_by)))
+                else:
+                    params[ep_param] = getattr(self, field)
+        return params
+
+    def __str__(self):
+        params = [f'{k}={self._attr2str(v)}' for k, v in self.ep_params.items()]
+        return self.NAME + "(" + ",".join(params) + ")"
+
+    def to_ep(self, name: Optional[str]) -> otq.EpBase:
+        params = dict((k.lower(), v) for k, v in self.ep_params.items())
+        if 'output_field_name' not in self.FIELDS_TO_SKIP:
+            params['output_field_name'] = name
+        return self.EP(**params)
+
+    @validate
+    def _get_common_schema(self, src: 'Source', name: str) -> Dict:
+        """return data schema without output fields (this fields should be added further)"""
+        schema = {}
+        for column in self.group_by:
+            schema[str(column)] = src.schema[str(column)]
+        if self.all_fields:
+            schema.update(src.schema)
+        return schema
+
+    def _modify_source(self, res: 'Source', **kwargs):
+        """
+        Modify resulting source inplace before sinking to aggregation.
+        Can be overriden if needed.
+        """
+        pass
+
+    def _get_output_schema(self, src: 'Source', name: Optional[str] = None) -> Dict:
+        if not name or name in src.__class__.meta_fields:
+            return {}
+        return {
+            name: self.output_field_type or src.schema[self.column_name]
+        }
+
+    @operation_gb
+    @operation_replacer
+    @output_column_overwriter
+    def apply(self, src: 'Source', name: Optional[str] = None, inplace: bool = False) -> 'Source':
+        """
+        Applies aggregation to Source and sets proper schema
+
+        Parameters
+        ----------
+        src: Source
+            Source to apply aggregation
+        name: str, optional
+            Name of output column. If not specified, will be used self.column_name
+        inplace: bool
+            Modify passed ``src`` object or return modified copy.
+        """
+        if inplace:
+            res = src
+            src = src.copy()
+        else:
+            res = src.copy()
+        out_name = name or self.column_name
+        schema = self._get_common_schema(src, out_name)
+        # it's important to validate input schema before sinking
+        self._modify_source(res)
+        res.sink(self.to_ep(name=str(out_name)))
+        schema.update(self._get_output_schema(src, str(out_name)))
+        res.schema.set(**schema)
+
+        if not self.all_fields:
+            # in this case we propagate only resulting fields, that stored in res.schema (flexible schema case)
+            res._add_table(strict=True)
+        else:
+            # adding table to convert types in schema, e.g. float to int
+            res._add_table(strict=False)
+        return res
+
+    def validate_input_columns(self, src: 'Source'):
+        """checks that columns used in aggregation presented in Source"""
+        if self.column_name not in src.schema:
+            raise TypeError(f"Aggregation `{self.NAME}` uses column `{self.column_name}` as input, which doesn't exist")
+        if not self.require_type:
+            return
+        dtype = src.schema[self.column_name]
+        base_dtype = ott.get_base_type(dtype)
+        for t in self.require_type:
+            # more generic types can be specified in self.require_type too
+            if dtype is t or base_dtype is t:
+                return
+        raise TypeError(f"Aggregation `{self.NAME}` require {self.require_type} types, got {dtype}")
+
+    @staticmethod
+    def validate_output_name(schema: Dict, name: Union[List, str]):
+        """checks that aggregation won't output columns with same names"""
+        if not isinstance(name, list):
+            name = [name]
+
+        same_fields = []
+        for n in name:
+            if n in schema:
+                if '__long_nsec_' in n:
+                    same_fields.append(n.replace('__long_nsec_', ''))  # hack for large ints
+                else:
+                    same_fields.append(n)
+        if same_fields:
+            raise ValueError("You try to propagate all fields and put result into already existing fields: "
+                             f"'{', '.join(same_fields)}' ")
+
+    def _param_validation(self):
+        """validate __init__ parameters"""
+        if self.running and self.bucket_time == "start":
+            raise ValueError("It is not allowed to set up running=True and bucket_time='start'")
+        if self.bucket_units == "flexible" and self.bucket_end_condition is None:
+            raise ValueError("bucket_units is set to 'flexible' but bucket_end_condition is not specified. "
+                             "Please specify bucket_end_condition.")
+        if self.bucket_units != "flexible" and self.bucket_end_condition is not None:
+            raise ValueError("bucket_end_condition can be used only with 'flexible' bucket_units. "
+                             "Please set bucket_units to 'flexible'.")
+
+        if self.bucket_time not in ['start', 'end']:
+            raise ValueError(f"'bucket_time' might be either 'start' or 'end', but passed '{self.bucket_time}'")
+
+        valid_units = ("seconds", "ticks", "days", "months", "flexible")
+        if self.bucket_units not in valid_units:
+            raise ValueError("'bucket_units' can be one of the following: "
+                             f"'{', '.join(valid_units)}'; however, '{self.bucket_units}' was passed")
+
+        valid_boundary = {"new", "previous"}
+        if self.boundary_tick_bucket not in valid_boundary:
+            message = "'boundary_tick_bucket' can be one of the following: {}; however, {} was passed"
+            raise ValueError(message.format(', '.join(list(valid_boundary)), self.boundary_tick_bucket))
+
+        for column in self.group_by:
+            if not isinstance(column, _Operation) and not isinstance(column, str):
+                raise TypeError(f"Unsupported type '{column}' of a column to group by")
+
+        if self.groups_to_display not in ('all', 'event_in_last_bucket'):
+            raise ValueError("Parameter 'groups_to_display' can only be set to 'all' or 'event_in_last_bucket':"
+                             f" got '{self.groups_to_display}'")
+
+        if self.all_fields and not self.running and 'running_all_fields' not in self._validations_to_skip:
+            raise ValueError("It is not allowed set all_fields to True for not running aggregation")
+
+        if not self.running and self.overwrite_output_field:
+            raise ValueError("Parameter 'overwrite_output_field' can only be used with running aggregations")
+
+    @property
+    def is_multi_column_aggregation(self):
+        return isinstance(self, _MultiColumnAggregation)
+
+    @property
+    def is_all_columns_aggregation(self):
+        return isinstance(self, _AllColumnsAggregation)
+
+
+class _AggregationTSType(_Aggregation):
+
+    FIELDS_MAPPING = deepcopy(_Aggregation.FIELDS_MAPPING)
+    FIELDS_MAPPING['time_series_type'] = 'TIME_SERIES_TYPE'
+    FIELDS_DEFAULT = deepcopy(_Aggregation.FIELDS_DEFAULT)
+    FIELDS_DEFAULT['time_series_type'] = 'event_ts'
+
+    def __init__(self, column, time_series_type: str = "event_ts", *args, **kwargs):
+        """
+        Abstract class that implements common logic for aggregations with ability to select time series type
+        inherited from _Aggregation
+
+        Parameters
+        ----------
+        column: see _Aggregation
+        time_series_type: "event_ts" or "state_ts", default="event_ts"
+            "state_ts":
+                if there is a tick in bucket with timestamp = bucket start:
+                    only ticks in bucket used for calculation max value
+                else:
+                    latest tick from previous bucket included in current bucket
+            "event_ts": only ticks from current bucket used for calculations
+        args: see _Aggregation
+        kwargs: see _Aggregation
+        """
+        if time_series_type not in ["event_ts", "state_ts"]:
+            raise ValueError('time_series_type argument must be "event_ts" or "state_ts"')
+        self.time_series_type = time_series_type
+        super().__init__(column, *args, **kwargs)
+
+
+class _AggregationTSSelection(_Aggregation):
+
+    FIELDS_MAPPING = deepcopy(_Aggregation.FIELDS_MAPPING)
+    FIELDS_MAPPING['selection'] = 'SELECTION'
+    FIELDS_DEFAULT = deepcopy(_Aggregation.FIELDS_DEFAULT)
+    FIELDS_DEFAULT['selection'] = 'first'
+
+    def __init__(self, column, selection: str = "first", *args, **kwargs):
+        if selection not in ["first", "last"]:
+            raise ValueError(f'{self.__class__.__name__} selection argument must be "first" or "last"')
+        self.selection = selection
+        super().__init__(column, *args, **kwargs)
+
+
+class _FloatAggregation(_Aggregation):
+
+    require_type = (int, float, ott._inf, ott.decimal)
+
+    """
+    Aggregation that expect int or float as input
+    """
+
+
+class _KeepTs(_Aggregation):
+
+    def __init__(self, *args, keep_timestamp=True, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.keep_timestamp = keep_timestamp
+
+    @validate  # type: ignore
+    def _get_common_schema(self, src: 'Source', *args, **kwargs) -> Dict:
+        schema = src.schema.copy()
+        schema['TICK_TIME'] = ott.nsectime
+        return schema
+
+    def apply(self, src: 'Source', *args, **kwargs) -> 'Source':
+        res = super().apply(src=src, *args, **kwargs)
+        if self.keep_timestamp:
+            # TICK_TIME can be empty if it's a tick from default_tick aggregation parameter
+            res['TICK_TIME'] = res.if_else(res['TICK_TIME'], res['TICK_TIME'], res['TIMESTAMP'])
+            res['TIMESTAMP'] = res['TICK_TIME']
+            res.drop('TICK_TIME', inplace=True)
+        return res
+
+
+class _ExpectLargeInts(_Aggregation):
+    FIELDS_MAPPING = deepcopy(_Aggregation.FIELDS_MAPPING)
+    FIELDS_MAPPING['large_ints'] = 'EXPECT_LARGE_INTS'
+    FIELDS_MAPPING['null_int_val'] = 'NULL_INT_VAL'
+    FIELDS_DEFAULT = deepcopy(_Aggregation.FIELDS_DEFAULT)
+    FIELDS_DEFAULT['large_ints'] = False
+    FIELDS_DEFAULT['null_int_val'] = 0
+
+    def __init__(self, *args, large_ints=False, null_int_val=0, **kwargs):
+        super().__init__(*args, **kwargs)
+        if large_ints not in {True, False, utils.adaptive}:
+            raise ValueError(f"Wrong value for {self.__class__.__name__} aggregation"
+                             f" 'large_ints' parameter: {large_ints}")
+        if large_ints is utils.adaptive:
+            large_ints = 'IF_INPUT_VAL_IS_LONG_INTEGER'
+
+        if null_int_val and not large_ints:
+            raise ValueError(
+                f"Wrong value for {self.__class__.__name__} aggregation:"
+                f" 'null_int_val' parameter is set, however 'large_ints' is `False`"
+            )
+
+        self.large_ints = large_ints
+        self.null_int_val = null_int_val
+
+    def apply(self, src: 'Source', name: Optional[str] = None) -> 'Source':
+        out_name = name or self.column_name
+        res, col, convert_back = self._ts_to_long(src, str(out_name))
+        res = super().apply(res, col.tmp_out_column)
+        if not convert_back:
+            return res
+        return self._long_to_ts(res, col)
+
+    def _ts_to_long(self, src: 'Source', name: str) -> Tuple['Source', Any, bool]:
+        agg_columns = namedtuple('agg_columns', ('in_column', 'tmp_in_column', 'tmp_out_column', 'out_column'))
+        if src.schema[self.column_name] != ott.nsectime:
+            return src, agg_columns(self.column_name, self.column_name, name, name), False
+        self.large_ints = True
+        res = src.copy()
+        col = agg_columns(self.column_name, f'__long_nsec_{self.column_name}',
+                          f'__long_nsec_{name}', name)
+        res[col.tmp_in_column] = res[col.in_column].apply(int)
+        self.column_name = col.tmp_in_column
+        return res, col, True
+
+    def _long_to_ts(self, src: 'Source', col) -> 'Source':
+        res = src.copy()
+        res[col.out_column] = res[col.tmp_out_column].astype(ott.nsectime)
+        to_drop = []
+        for c in [col.tmp_out_column, col.tmp_in_column]:
+            if c in res.schema:
+                to_drop.append(c)
+        if to_drop:
+            res.drop(to_drop, inplace=True)
+        self.column_name = col.in_column
+        return res
+
+
+class _MultiColumnAggregation:
+    """
+    Helper class for identifying multi-column aggregations.
+    """
+    pass
+
+
+class _AllColumnsAggregation(_MultiColumnAggregation):
+    """
+    Helper class for identifying aggregations, which returns all fields from original ticks.
+    """
+    pass