onetick-py 1.177.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- locator_parser/__init__.py +0 -0
- locator_parser/acl.py +73 -0
- locator_parser/actions.py +262 -0
- locator_parser/common.py +368 -0
- locator_parser/io.py +43 -0
- locator_parser/locator.py +150 -0
- onetick/__init__.py +101 -0
- onetick/doc_utilities/__init__.py +3 -0
- onetick/doc_utilities/napoleon.py +40 -0
- onetick/doc_utilities/ot_doctest.py +140 -0
- onetick/doc_utilities/snippets.py +279 -0
- onetick/lib/__init__.py +4 -0
- onetick/lib/instance.py +141 -0
- onetick/py/__init__.py +293 -0
- onetick/py/_stack_info.py +89 -0
- onetick/py/_version.py +2 -0
- onetick/py/aggregations/__init__.py +11 -0
- onetick/py/aggregations/_base.py +648 -0
- onetick/py/aggregations/_docs.py +948 -0
- onetick/py/aggregations/compute.py +286 -0
- onetick/py/aggregations/functions.py +2216 -0
- onetick/py/aggregations/generic.py +104 -0
- onetick/py/aggregations/high_low.py +80 -0
- onetick/py/aggregations/num_distinct.py +83 -0
- onetick/py/aggregations/order_book.py +501 -0
- onetick/py/aggregations/other.py +1014 -0
- onetick/py/backports.py +26 -0
- onetick/py/cache.py +374 -0
- onetick/py/callback/__init__.py +5 -0
- onetick/py/callback/callback.py +276 -0
- onetick/py/callback/callbacks.py +131 -0
- onetick/py/compatibility.py +798 -0
- onetick/py/configuration.py +771 -0
- onetick/py/core/__init__.py +0 -0
- onetick/py/core/_csv_inspector.py +93 -0
- onetick/py/core/_internal/__init__.py +0 -0
- onetick/py/core/_internal/_manually_bound_value.py +6 -0
- onetick/py/core/_internal/_nodes_history.py +250 -0
- onetick/py/core/_internal/_op_utils/__init__.py +0 -0
- onetick/py/core/_internal/_op_utils/every_operand.py +9 -0
- onetick/py/core/_internal/_op_utils/is_const.py +10 -0
- onetick/py/core/_internal/_per_tick_scripts/tick_list_sort_template.script +121 -0
- onetick/py/core/_internal/_proxy_node.py +140 -0
- onetick/py/core/_internal/_state_objects.py +2312 -0
- onetick/py/core/_internal/_state_vars.py +93 -0
- onetick/py/core/_source/__init__.py +0 -0
- onetick/py/core/_source/_symbol_param.py +95 -0
- onetick/py/core/_source/schema.py +97 -0
- onetick/py/core/_source/source_methods/__init__.py +0 -0
- onetick/py/core/_source/source_methods/aggregations.py +809 -0
- onetick/py/core/_source/source_methods/applyers.py +296 -0
- onetick/py/core/_source/source_methods/columns.py +141 -0
- onetick/py/core/_source/source_methods/data_quality.py +301 -0
- onetick/py/core/_source/source_methods/debugs.py +272 -0
- onetick/py/core/_source/source_methods/drops.py +120 -0
- onetick/py/core/_source/source_methods/fields.py +619 -0
- onetick/py/core/_source/source_methods/filters.py +1002 -0
- onetick/py/core/_source/source_methods/joins.py +1413 -0
- onetick/py/core/_source/source_methods/merges.py +605 -0
- onetick/py/core/_source/source_methods/misc.py +1455 -0
- onetick/py/core/_source/source_methods/pandases.py +155 -0
- onetick/py/core/_source/source_methods/renames.py +356 -0
- onetick/py/core/_source/source_methods/sorts.py +183 -0
- onetick/py/core/_source/source_methods/switches.py +142 -0
- onetick/py/core/_source/source_methods/symbols.py +117 -0
- onetick/py/core/_source/source_methods/times.py +627 -0
- onetick/py/core/_source/source_methods/writes.py +986 -0
- onetick/py/core/_source/symbol.py +205 -0
- onetick/py/core/_source/tmp_otq.py +222 -0
- onetick/py/core/column.py +209 -0
- onetick/py/core/column_operations/__init__.py +0 -0
- onetick/py/core/column_operations/_methods/__init__.py +4 -0
- onetick/py/core/column_operations/_methods/_internal.py +28 -0
- onetick/py/core/column_operations/_methods/conversions.py +216 -0
- onetick/py/core/column_operations/_methods/methods.py +292 -0
- onetick/py/core/column_operations/_methods/op_types.py +160 -0
- onetick/py/core/column_operations/accessors/__init__.py +0 -0
- onetick/py/core/column_operations/accessors/_accessor.py +28 -0
- onetick/py/core/column_operations/accessors/decimal_accessor.py +104 -0
- onetick/py/core/column_operations/accessors/dt_accessor.py +537 -0
- onetick/py/core/column_operations/accessors/float_accessor.py +184 -0
- onetick/py/core/column_operations/accessors/str_accessor.py +1367 -0
- onetick/py/core/column_operations/base.py +1121 -0
- onetick/py/core/cut_builder.py +150 -0
- onetick/py/core/db_constants.py +20 -0
- onetick/py/core/eval_query.py +245 -0
- onetick/py/core/lambda_object.py +441 -0
- onetick/py/core/multi_output_source.py +232 -0
- onetick/py/core/per_tick_script.py +2256 -0
- onetick/py/core/query_inspector.py +464 -0
- onetick/py/core/source.py +1744 -0
- onetick/py/db/__init__.py +2 -0
- onetick/py/db/_inspection.py +1128 -0
- onetick/py/db/db.py +1327 -0
- onetick/py/db/utils.py +64 -0
- onetick/py/docs/__init__.py +0 -0
- onetick/py/docs/docstring_parser.py +112 -0
- onetick/py/docs/utils.py +81 -0
- onetick/py/functions.py +2398 -0
- onetick/py/license.py +190 -0
- onetick/py/log.py +88 -0
- onetick/py/math.py +935 -0
- onetick/py/misc.py +470 -0
- onetick/py/oqd/__init__.py +22 -0
- onetick/py/oqd/eps.py +1195 -0
- onetick/py/oqd/sources.py +325 -0
- onetick/py/otq.py +216 -0
- onetick/py/pyomd_mock.py +47 -0
- onetick/py/run.py +916 -0
- onetick/py/servers.py +173 -0
- onetick/py/session.py +1347 -0
- onetick/py/sources/__init__.py +19 -0
- onetick/py/sources/cache.py +167 -0
- onetick/py/sources/common.py +128 -0
- onetick/py/sources/csv.py +642 -0
- onetick/py/sources/custom.py +85 -0
- onetick/py/sources/data_file.py +305 -0
- onetick/py/sources/data_source.py +1045 -0
- onetick/py/sources/empty.py +94 -0
- onetick/py/sources/odbc.py +337 -0
- onetick/py/sources/order_book.py +271 -0
- onetick/py/sources/parquet.py +168 -0
- onetick/py/sources/pit.py +191 -0
- onetick/py/sources/query.py +495 -0
- onetick/py/sources/snapshots.py +419 -0
- onetick/py/sources/split_query_output_by_symbol.py +198 -0
- onetick/py/sources/symbology_mapping.py +123 -0
- onetick/py/sources/symbols.py +374 -0
- onetick/py/sources/ticks.py +825 -0
- onetick/py/sql.py +70 -0
- onetick/py/state.py +251 -0
- onetick/py/types.py +2131 -0
- onetick/py/utils/__init__.py +70 -0
- onetick/py/utils/acl.py +93 -0
- onetick/py/utils/config.py +186 -0
- onetick/py/utils/default.py +49 -0
- onetick/py/utils/file.py +38 -0
- onetick/py/utils/helpers.py +76 -0
- onetick/py/utils/locator.py +94 -0
- onetick/py/utils/perf.py +498 -0
- onetick/py/utils/query.py +49 -0
- onetick/py/utils/render.py +1374 -0
- onetick/py/utils/script.py +244 -0
- onetick/py/utils/temp.py +471 -0
- onetick/py/utils/types.py +120 -0
- onetick/py/utils/tz.py +84 -0
- onetick_py-1.177.0.dist-info/METADATA +137 -0
- onetick_py-1.177.0.dist-info/RECORD +152 -0
- onetick_py-1.177.0.dist-info/WHEEL +5 -0
- onetick_py-1.177.0.dist-info/entry_points.txt +2 -0
- onetick_py-1.177.0.dist-info/licenses/LICENSE +21 -0
- onetick_py-1.177.0.dist-info/top_level.txt +2 -0
onetick/py/functions.py
ADDED
@@ -0,0 +1,2398 @@
import itertools
import warnings
import inspect
import re
import datetime as dt
from collections import defaultdict, Counter
from functools import singledispatch
from itertools import chain, zip_longest, repeat
from typing import List, Union, Type, Optional, Sequence
from enum import Enum

from onetick.py.otq import otq

from onetick.py.configuration import config, default_presort_concurrency
from onetick.py.core.eval_query import _QueryEvalWrapper
from onetick.py.core._source._symbol_param import _SymbolParamSource
from onetick.py.core._source.tmp_otq import TmpOtq
from onetick.py.utils import get_type_that_includes, adaptive, default
import onetick.py.types as ott
from onetick.py.core.column import Column
from onetick.py.core.column_operations.base import Operation
from onetick.py.core.cut_builder import _QCutBuilder, _CutBuilder
from onetick.py.backports import Literal
from onetick.py.compatibility import (
    is_supported_join_with_aggregated_window,
    is_supported_next_in_join_with_aggregated_window,
    is_apply_rights_supported,
)


__all__ = ['merge', 'join', 'join_by_time', 'apply_query', 'apply', 'cut', 'qcut', 'coalesce', 'corp_actions', 'format']


def output_type_by_index(sources, index):
    if index is None:
        from onetick.py.core.source import _Source
        return _Source
    return type(sources[index])
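
# Illustrative sketch (editor's assumption, not code from the package): given a
# hypothetical Source subclass, this helper picks the class used to build the
# merged/joined result:
#
#     output_type_by_index([custom_source, other_source], None)  # -> _Source
#     output_type_by_index([custom_source, other_source], 0)     # -> type(custom_source)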


def apply_symbol_to_ep(base_ep, symbol, tmp_otq, symbol_date=None):
    if not symbol:
        return base_ep

    from onetick.py.core.source import _Source
    from onetick.py.sources import query as otp_query

    if isinstance(symbol, _QueryEvalWrapper):
        symbol = symbol.to_eval_string(tmp_otq=tmp_otq, symbol_date=symbol_date)
    elif isinstance(symbol, otp_query):
        if symbol_date is not None:
            raise ValueError("Parameter 'symbol_date' is not supported if symbols are set with otp.query object")
        symbol = symbol.to_eval_string()
    elif isinstance(symbol, (_Source, otq.GraphQuery)):
        symbol = _Source._convert_symbol_to_string(symbol, tmp_otq=tmp_otq, symbol_date=symbol_date,)

    return base_ep.symbols(symbol)
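
# Usage sketch (an assumption, not from the package): this helper binds symbols
# to a raw onetick.query EP before it is wrapped into a Source, the way merge()
# does below in _base_ep_for_cross_symbol():
#
#     base_ep = otq.Merge(identify_input_ts=False)
#     base_ep = apply_symbol_to_ep(base_ep, symbols, tmp_otq, symbol_date=None)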


def merge(sources, align_schema=True, symbols=None, identify_input_ts=False,
          presort=adaptive, concurrency=default, batch_size=default, output_type_index=None,
          add_symbol_index: bool = False, separate_db_name: bool = False,
          added_field_name_suffix: str = '', stabilize_schema: Union[Type[adaptive], bool] = adaptive,
          enforce_order: bool = False, symbol_date=None):
    """
    Merges ticks from the ``sources`` into a single output ordered by the timestamp.

    Note
    ----
    If merged ticks have the same timestamp, their order is not guaranteed by default.
    Set parameter ``enforce_order`` to set the order according to parameter ``sources``.

    Parameters
    ----------
    sources : list
        List of sources to merge
    align_schema : bool
        If set to True, then a table is added right after merge.
        We recommend keeping it True to prevent problems with
        different tick schemas. Default: True
    symbols: str, list of str or functions, :class:`Source`, :py:class:`onetick.query.GraphQuery`
        Symbol(s) to run the query for, passed as a string, a list of strings, or as a "symbols" query whose results
        include the ``SYMBOL_NAME`` column. The start/end times for the
        symbols query will be taken from the :meth:`run` params.
        See :ref:`symbols <static/concepts/symbols:Symbols: bound and unbound>` for more details.
    identify_input_ts: bool
        If set to False, the fields *SYMBOL_NAME* and *TICK_TYPE* are not appended to the output ticks.
    presort: bool
        Add the **PRESORT** EP before merging.
        By default, it is set to True if ``symbols`` are set
        and to False otherwise.
    concurrency: int
        Specifies the number of CPU cores to utilize for the ``presort``.
        By default, the value is inherited from the value of the query where this PRESORT is used.

        For the main query it may be specified in the ``concurrency`` parameter of the :meth:`run` method
        (which by default is set to
        :py:attr:`otp.config.default_concurrency<onetick.py.configuration.Config.default_concurrency>`).

        For the auxiliary queries (like first-stage queries) an empty value means OneTick's default of 1.
        If :py:attr:`otp.config.presort_force_default_concurrency<onetick.py.configuration.Config.presort_force_default_concurrency>`
        is set, then the default concurrency value will be set in all PRESORT EPs in all queries.
    batch_size: int
        Specifies the query batch size for the ``presort``.
        By default, the value from
        :py:attr:`otp.config.default_batch_size<onetick.py.configuration.Config.default_batch_size>`
        is used.
    output_type_index: int
        Specifies the index of the source in ``sources`` from which the type and properties of the output
        will be taken.
        Useful when merging sources that inherited from :class:`Source`.
        By default, the output object type will be :class:`Source`.
    add_symbol_index: bool
        If set to True, this function adds a field *SYMBOL_INDEX* to each tick,
        with a numeric index (1-based) corresponding to the symbol the tick is for.
    separate_db_name: bool
        If set to True, the security name of the input time series is separated into
        the pure symbol name and the database name parts,
        propagated in the *SYMBOL_NAME* and *DB_NAME* fields, respectively.
        Otherwise, the full symbol name is propagated in a single field called *SYMBOL_NAME*.
    added_field_name_suffix: str
        The suffix to add to the names of additional fields
        (that is, *SYMBOL_NAME*, *TICK_TYPE*, *DB_NAME* and *SYMBOL_INDEX*).
    stabilize_schema: bool
        If set to True, any fields that were present on any tick in the input time series
        will be present in the ticks of the output time series.
        New fields will be added to the output tick at the point they are first seen in the input time series.
        If any field already present in the input is not present on a given input tick,
        its type will be determined by the widest encountered type under that field name.
        Incompatible types (for example, int and float) under the same field name will result in an exception.

        Default is False.
    enforce_order: bool
        If merged ticks have the same timestamp, their order is not guaranteed by default.
        Set this parameter to True to set the order according to parameter ``sources``.

        The special OneTick field *OMDSEQ* will be used to order sources.
        If it exists, then it will be overwritten and deleted.
    symbol_date: :py:class:`otp.datetime <onetick.py.datetime>` or :py:class:`datetime.datetime` or int
        Symbol date or integer in the YYYYMMDD format.
        Can only be specified if parameter ``symbols`` is set.

    Returns
    -------
    :class:`Source` or same class as ``sources[output_type_index]``
        A time series of ticks.

    See also
    --------
    **MERGE** and **PRESORT** OneTick event processors

    Examples
    --------

    ``merge`` is used to merge different data sources:

    >>> data1 = otp.Ticks(X=[1, 2], Y=['a', 'd'])
    >>> data2 = otp.Ticks(X=[-1, -2], Y=['*', '-'])
    >>> data = otp.merge([data1, data2])  # OTdirective: snippet-name:merge.as list;
    >>> otp.run(data)
                         Time  X  Y
    0 2003-12-01 00:00:00.000  1  a
    1 2003-12-01 00:00:00.000 -1  *
    2 2003-12-01 00:00:00.001  2  d
    3 2003-12-01 00:00:00.001 -2  -

    Merge series from multiple symbols into one series:

    >>> # OTdirective: snippet-name:merge.bound symbols;
    >>> data = otp.Ticks(X=[1])
    >>> data['SYMBOL_NAME'] = data['_SYMBOL_NAME']
    >>> symbols = otp.Ticks(SYMBOL_NAME=['A', 'B'])
    >>> data = otp.merge([data], symbols=symbols)
    >>> otp.run(data)
            Time  X SYMBOL_NAME
    0 2003-12-01  1           A
    1 2003-12-01  1           B

    Use ``identify_input_ts`` and other parameters to add information about the symbol to each tick:

    >>> symbols = otp.Ticks(SYMBOL_NAME=['COMMON::S1', 'DEMO_L1::S2'])
    >>> data = otp.Tick(A=1, db=None, tick_type='TT')
    >>> data = otp.merge([data], symbols=symbols, identify_input_ts=True,
    ...                  separate_db_name=True, add_symbol_index=True, added_field_name_suffix='__')
    >>> otp.run(data)
            Time  A SYMBOL_NAME__ DB_NAME__ TICK_TYPE__  SYMBOL_INDEX__
    0 2003-12-01  1            S1    COMMON          TT               1
    1 2003-12-01  1            S2   DEMO_L1          TT               2

    Adding symbol parameters before merge:

    >>> symbols = otp.Ticks(SYMBOL_NAME=['S1', 'S2'], param=[1, -1])
    >>> def func(symbol):
    ...     pre = otp.Ticks(X=[1])
    ...     pre["SYMBOL_NAME"] = symbol.name
    ...     pre["PARAM"] = symbol.param
    ...     return pre
    >>> data = otp.merge([func], symbols=symbols)
    >>> otp.run(data)[['PARAM', 'SYMBOL_NAME']]
       PARAM SYMBOL_NAME
    0      1          S1
    1     -1          S2

    Use parameter ``output_type_index`` to specify which input class to use to create the output object.
    It may be useful in case some custom user class was used as input:

    >>> class CustomTick(otp.Tick):
    ...     def custom_method(self):
    ...         return 'custom_result'
    >>> data1 = otp.Tick(A=1)
    >>> data2 = CustomTick(B=2)
    >>> data = otp.merge([data1, data2], output_type_index=1)
    >>> type(data)
    <class 'onetick.py.functions.CustomTick'>
    >>> data.custom_method()
    'custom_result'
    >>> otp.run(data)
            Time  A  B
    0 2003-12-01  1  0
    1 2003-12-01  0  2
    """  # noqa: E501
    from onetick.py.core.source import _Source

    if not sources:
        raise ValueError("Merge should have one or more inputs")

    output_type = output_type_by_index(sources, output_type_index)

    if presort is adaptive:
        presort = True if symbols is not None else False

    if concurrency is not default and not presort:
        warnings.warn("Using the `concurrency` parameter makes effect only when "
                      "the `presort` parameter is set to True")
    if batch_size is not default and not presort:
        warnings.warn("Using the `batch_size` parameter makes effect only when "
                      "the `presort` parameter is set to True")

    if concurrency is default:
        concurrency = default_presort_concurrency()
    if concurrency is None:
        # None means inherit concurrency from the query where this EP is used
        # otq.Presort does not support None
        concurrency = ''

    if batch_size is default:
        batch_size = config.default_batch_size

    merge_kwargs = {
        'identify_input_ts': identify_input_ts,
        'add_symbol_index': add_symbol_index,
        'separate_db_name': separate_db_name,
        'added_field_name_suffix': added_field_name_suffix,
    }

    if 'stabilize_schema' in otq.Merge.Parameters.list_parameters():
        if stabilize_schema is adaptive:
            stabilize_schema = False
        merge_kwargs['stabilize_schema'] = stabilize_schema
    elif stabilize_schema is not adaptive:
        raise ValueError("Parameter 'stabilize_schema' is not supported in this OneTick build")

    if symbol_date is not None:
        if symbols is None:
            raise ValueError("Parameter 'symbol_date' can only be specified together with parameter 'symbols'")
        if isinstance(symbols, (str, list)):
            # this is a hack
            # onetick.query doesn't have an interface to set symbol_date for the EP node
            # so instead of setting symbols for the EP node,
            # we will turn symbol list into the first stage query, and symbol_date will be set for this query
            import onetick.py as otp
            if isinstance(symbols, str):
                symbols = [symbols]
            symbols = otp.Ticks(SYMBOL_NAME=symbols)

    def _base_ep_for_cross_symbol(symbol, tmp_otq, symbol_date=None):
        if presort:
            base_ep = otq.Presort(batch_size=batch_size, max_concurrency=concurrency)
        else:
            base_ep = otq.Merge(**merge_kwargs)

        base_ep = apply_symbol_to_ep(base_ep, symbol, tmp_otq, symbol_date=symbol_date)

        return base_ep

    def _evaluate_functions_in_sources_list(sources, symbols):
        result = []

        if not isinstance(sources, list):
            sources = [sources]

        for s in sources:
            if not isinstance(s, _Source) and callable(s):
                num_params = len(inspect.signature(s).parameters)

                if num_params == 0:
                    s = s()
                elif num_params == 1:
                    s = s(symbols.to_symbol_param() if isinstance(symbols, (_Source, _QueryEvalWrapper))
                          else _SymbolParamSource())
                else:
                    raise ValueError(
                        f"It is expected only one parameter from the callback, but {num_params} passed"
                    )  # TODO: test this case
            if isinstance(s, _Source):
                result.append(s)
            else:
                raise ValueError("Source and functions (returning _source) are expected as preprocessors")
        return result

    sources = _evaluate_functions_in_sources_list(sources, symbols)
    if enforce_order:
        sources = _enforce_order_for_sources(sources)
    need_table = False
    merged_columns, need_table, used_columns = _collect_merged_columns(need_table, sources)
    need_table = _is_table_after_merge_needed(need_table, used_columns)

    # we need to store internal graphs somewhere while we create base ep from eval
    intermediate_tmp_otq = TmpOtq()
    result = output_type(node=_base_ep_for_cross_symbol(symbols, tmp_otq=intermediate_tmp_otq, symbol_date=symbol_date),
                         schema=merged_columns)
    result._tmp_otq.merge(intermediate_tmp_otq)

    __copy_sources_on_merge_or_join(result, sources, symbols, output_type_index=output_type_index)

    if presort:
        result.sink(otq.Merge(**merge_kwargs))

    if enforce_order:
        result.drop('OMDSEQ', inplace=True)
        merged_columns.pop('OMDSEQ')

    if identify_input_ts:
        result.schema['SYMBOL_NAME' + added_field_name_suffix] = str
        result.schema['TICK_TYPE' + added_field_name_suffix] = str
        if separate_db_name:
            result.schema['DB_NAME' + added_field_name_suffix] = str

    if add_symbol_index:
        result.schema['SYMBOL_INDEX' + added_field_name_suffix] = int

    result = _add_table_after_merge(align_schema, merged_columns, need_table, result)
    result._fix_varstrings()
    return result
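
# Illustrative sketch (not from the package): with enforce_order=True, ticks with
# equal timestamps keep the order of the `sources` list; the temporary OMDSEQ
# field used for the ordering is dropped from the result before it is returned.
#
#     data1 = otp.Ticks(X=['first'])
#     data2 = otp.Ticks(X=['second'])
#     data = otp.merge([data1, data2], enforce_order=True)
#     # otp.run(data) is expected to yield 'first' before 'second'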


def _add_table_after_merge(add_table, merged_columns, need_table, result):
    if add_table and need_table:
        # a special case, when the add_table parameter is a list of common columns that should
        # be added to a final table
        # it is used internally
        if isinstance(add_table, list):
            merged_columns = {key: value for key, value in merged_columns.items() if key in add_table}

        if len(merged_columns):
            table = otq.Table(
                fields=",".join(ott.type2str(dtype) + " " + name for name, dtype in merged_columns.items()),
                keep_input_fields=True,
            )
            result.sink(table)
    return result
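
# Sketch of the TABLE fields string built above (editor's assumption about the
# type names ott.type2str emits): merged_columns like {'PRICE': float, 'SIZE': int}
# would produce something like fields="double PRICE,long SIZE", with
# keep_input_fields=True preserving any extra input fields.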


def __copy_sources_on_merge_or_join(result,
                                    sources,
                                    symbols=None,
                                    names=None,
                                    drop_meta=False,
                                    leading=None,
                                    output_type_index=None,
                                    use_rename_ep=True):
    """ copy columns, state vars and other metadata from joined or merged sources

    Parameters
    ----------
    result: _Source
        Source object constructed as a join or merge operation, e.g. result = _Source(otq.Merge(sources))
    sources: list of _Source, tuple of _Source
        Sources that were joined or merged
    symbols:
        Symbols to copy
    names: list of str or None, tuple of str or None, bool, optional
        - If a collection of str or None, then add a passthrough ep with such a name to `sources` if the name
          is specified, or do not add anything if the corresponding item in names is None.
        - If True, then autogenerate such names in __SRC_{number}__ format
        - If None or False, then do not add passthrough eps and do not change node names.
    drop_meta : bool, optional
        If True, drop the TIMESTAMP and OMDSEQ fields
    leading : List of str, Tuple of str, Optional
        List of leading source names
    output_type_index: int, optional
        Specifies the index of the source in `sources` from which properties of `result` will be taken.
        Useful when merging sources that inherited from otp.Source.
    use_rename_ep: bool
        Use the :py:class:`onetick.query.RenameFields` event processor or not.
        This event processor can't be used in generic aggregation.

    Returns
    -------
    None
        Modifies ``result`` directly
    """
    from onetick.py.core.source import _Source

    result._copy_state_vars_from(sources)
    result._clean_sources_dates()  # because it is not a real _source

    for source in sources:
        result._merge_tmp_otq(source)
        if source.get_name():
            if not result.get_name():
                result.set_name(source.get_name())
            if result.get_name() != source.get_name():
                warnings.warn(f"Merging/joining sources with different names: '{result.get_name()}' "
                              f"and '{source.get_name()}'. Some of those names will be lost")

    if isinstance(symbols, _Source):
        result._merge_tmp_otq(symbols)

    names = __copy_and_rename_nodes_on_merge_join(result, names, sources, symbols)

    if drop_meta:
        to_drop = list(map(lambda x: x + ".TIMESTAMP", names))
        to_drop += list(map(lambda x: x + ".OMDSEQ", names))
        __rename_leading_omdseq(leading, names, result, sources, use_rename_ep=use_rename_ep)
        result.sink(otq.Passthrough(fields=",".join(to_drop), drop_fields=True))

    if output_type_index is not None:
        result._copy_properties_from(sources[output_type_index])


def __rename_fields(source, mapping, use_rename_ep=True):
    """
    Function to rename fields from ``mapping`` in ``source``.
    Note that it is a low-level function that doesn't change python schema of the ``source``.
    Modifies ``source`` inplace, doesn't return anything.
    If ``use_rename_ep`` is `True`, then the :py:class:`onetick.query.RenameFields` event processor will be used.
    """
    if use_rename_ep:
        source.sink(otq.RenameFields(','.join(f'{k}={v}' for k, v in mapping.items())))
        return
    # May be needed, because RenameFields ep is not supported in generic aggregation
    for old, new in mapping.items():
        # RenameFields ignores non-existent fields,
        # all this mess is needed to mimic that logic
        source.sink(otq.WhereClause(where=f'UNDEFINED("{old}")'))
        if_branch_graph = source.node().copy_graph()
        if_branch_rules = source.node().copy_rules()
        source.sink(otq.AddField(new, old), out_pin='ELSE')
        source.sink(otq.Passthrough(old, drop_fields=True))
        source.sink(otq.Merge(identify_input_ts=False))
        source.source(if_branch_graph)
        source.node().add_rules(if_branch_rules)
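
# Shape of the fallback branch above, as a sketch (not from the package): for each
# old -> new pair the graph splits on UNDEFINED("old"); the ELSE branch adds `new`
# from `old` and drops `old`, then both branches are merged back:
#
#     WHERE_CLAUSE(UNDEFINED("old")) --IF-----------------------------------> MERGE
#                                    --ELSE-> ADD_FIELD -> PASSTHROUGH(drop) ->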


def __rename_leading_omdseq(leading, names, result, sources, use_rename_ep=True):
    if leading is not None:
        if len(leading) == 1:
            leading = leading.pop()
            __rename_fields(result, {f"{leading}.OMDSEQ": "OMDSEQ"}, use_rename_ep=use_rename_ep)
        else:
            number, indexes = __get_number_and_indexes_of_sources_have_field(sources, "OMDSEQ")
            if number == 1:
                __rename_fields(result, {f"{names[indexes.pop()]}.OMDSEQ": "OMDSEQ"}, use_rename_ep=use_rename_ep)
            elif number:
                raise ValueError(
                    "Several sources was specified as leading and OMDSEQ field is presented in more than "
                    "one source. Resulted OMDSEQ can't be derived in such case."
                )


def __get_number_and_indexes_of_sources_have_field(sources, field):
    number = 0
    indexes = []
    for s in sources:
        if field in s.columns():
            indexes.append(number)
            number += 1
    return number, indexes


def __copy_and_rename_nodes_on_merge_join(result, names, sources, symbols):
    # shared eps between sources
    eps = defaultdict()
    if names is True:
        names = [f"__SRC_{n}__" for n in range(len(sources))]
    if not names:
        names = itertools.repeat(None)
    if sources:
        for name, src in zip(names, sources):
            obj = src
            if name:
                obj = src.copy()
                obj.sink(otq.Passthrough())
                obj.node_name(name)

            result.source(obj.node().copy_graph(eps))
            result.node().add_rules(obj.node().copy_rules())
            result._set_sources_dates(obj, copy_symbols=not bool(symbols))
    return names


def _is_table_after_merge_needed(need_table, used_columns):
    if not need_table:
        for key, value in used_columns.items():
            if not value:
                need_table = True
                break

    return need_table


def _collect_merged_columns(need_table, sources):
    merged_columns = sources[0].columns(skip_meta_fields=True)
    used_columns = {key: False for key in merged_columns.keys()}
    for src in sources[1:]:
        for key, value in src.columns(skip_meta_fields=True).items():
            if key in merged_columns:
                orig_type = merged_columns[key]
                try:
                    merged_dtype, merged_need_table = get_type_that_includes([orig_type, value])
                except ValueError as e:
                    raise ValueError(f"Column '{key}' has different types for "
                                     f"different branches: {orig_type} {value}") from e

                need_table |= merged_need_table
                merged_columns[key] = merged_dtype
            else:
                need_table = True
                merged_columns[key] = value

            if key in used_columns:
                used_columns[key] = True

    return merged_columns, need_table, used_columns
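
# Sketch of the widening rule above (editor's assumption about
# get_type_that_includes): compatible types under the same column name are
# widened to a common type (e.g. strings of different declared lengths widen to
# the longest), and `need_table` is flipped so a TABLE EP fixes the schema;
# pairs that cannot be reconciled raise the "different types for different
# branches" error.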


def concat(sources=None, add_table=True, symbols=None):
    """ Deprecated: Merges ticks from the sources into a single output _source ordered by the timestamp

    This function is deprecated due to the wrong name notation.
    Use 'merge' instead.

    Parameters
    ----------
    sources : list
        List of sources to merge
    add_table : bool
        If set to True, then a table is added right after merge.
        We recommend keeping it True to prevent problems with
        different tick schemas. Default: True

    Returns
    -------
    A new _source that holds a result of the merged sources
    """
    warnings.warn("This function is deprecated due the wrong name notation. Use `merge` instead.", FutureWarning)
    return merge(sources=sources, align_schema=add_table, symbols=symbols)


def _add_node_name_prefix_to_columns_in_operation(op, src):
    """
    Add the node name of source ``src`` as a prefix to all column names in operation ``op``.
    """
    if not isinstance(op, Operation):
        return op

    def fun(operation):
        if isinstance(operation, ott.ExpressionDefinedTimeOffset) and isinstance(operation.n, Operation):
            operation.n = operation.n._replace_parameters(fun)
        if isinstance(operation, Column) and operation.obj_ref is src:
            column = operation
            if not src.node_name().strip():
                raise ValueError('You set to use name for column prefix, but name is empty')
            name = f'{src.node_name()}.{column.name}'
            return Column(name, column.dtype, column.obj_ref, precision=getattr(column, "_precision", None))
        return None

    return op._replace_parameters(fun)


def _enforce_order_for_sources(sources):
    """
    Enforce order of sources by adding/modifying OMDSEQ field.
    """
    result = []
    for i, source in enumerate(sources):
        source = source.copy()
        source = source.table(strict=False, **{'OMDSEQ': int})
        source['OMDSEQ'] = i
        # this update_field is needed to let OneTick know that OMDSEQ was changed
        source.sink(otq.UpdateField(field='TIMESTAMP', value='TIMESTAMP'))
        result.append(source)
    return result
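
# Sketch (not from the package): each source gets OMDSEQ equal to its position in
# the list, so ticks with equal timestamps sort by source order after the merge:
#
#     _enforce_order_for_sources([a, b])  # a's ticks get OMDSEQ=0, b's get OMDSEQ=1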


def join(left, right, on, how='left_outer', rprefix='RIGHT', keep_fields_not_in_schema=False, output_type_index=None):
    """
    Joins two sources ``left`` and ``right`` based on ``on`` condition.

    In case you want to add a prefix/suffix to all columns in one of the sources, you should use
    :func:`Source.add_prefix` or :func:`Source.add_suffix`

    Parameters
    ----------
    left: :class:`Source`
        left source to join
    right: :class:`Source`
        right source to join
    on: :py:class:`~onetick.py.Operation` or 'all' or 'same_size' or list of strings

        If 'all', joins every tick from ``left`` with every tick from ``right``.

        If 'same_size' and the sizes of the sources are the same, joins ticks from the two sources directly,
        else raises an exception.

        If it is a list of strings, then ticks with the same ``on`` fields will be joined.

        If :py:class:`~onetick.py.Operation`, then only ticks on which the condition evaluates to True will be joined.
    how: 'inner' or 'left_outer'
        Joining type.
        Inner join will only produce ticks that matched the ``on`` condition.
        Left outer join will also produce the ticks from the ``left`` source
        that didn't match the condition.

        Doesn't matter for ``on='same_size'``.
    rprefix: str
        The name of the ``right`` data source. It will be added as a prefix to overlapping columns that arrive
        from ``right`` in the result
    keep_fields_not_in_schema: bool

        If True, the join function will try to preserve any fields of the original sources that are not in the
        source schema, propagating them to the output. This means a possibility of a runtime error if fields
        are duplicated.

        If False, will remove all fields that are not in the schema.
    output_type_index: int
        Specifies the index of the source in sources from which the type and properties of the output will be taken.
        Useful when joining sources that inherited from :class:`Source`.
        By default the output object type will be :class:`Source`.

    Note
    ----
    ``join`` does some internal optimization in case of using time-based ``on`` conditions. Optimization doesn't apply
    if the ``on`` expression has functions in it. So it is recommended to add/subtract a number of
    milliseconds (integers) instead.

    See examples for more details.

    Returns
    -------
    :class:`Source` or same class as ``[left, right][output_type_index]``
        joined data

    See also
    --------
    **JOIN** OneTick event processor

    Examples
    --------
    >>> d1 = otp.Ticks({'ID': [1, 2, 3], 'A': ['a', 'b', 'c']})
    >>> d2 = otp.Ticks({'ID': [2, 3, 4], 'B': ['q', 'w', 'e']})

    Outer join:

    >>> data = otp.join(d1, d2, on=d1['ID'] == d2['ID'], how='left_outer')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.000   1  a         0
    1 2003-12-01 00:00:00.001   2  b         2  q
    2 2003-12-01 00:00:00.002   3  c         3  w

    Inner join:

    >>> data = otp.join(d1, d2, on=d1['ID'] == d2['ID'], how='inner')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.001   2  b         2  q
    1 2003-12-01 00:00:00.002   3  c         3  w

    Join all ticks:

    >>> data = otp.join(d1, d2, on='all')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.000   1  a         2  q
    1 2003-12-01 00:00:00.000   1  a         3  w
    2 2003-12-01 00:00:00.000   1  a         4  e
    3 2003-12-01 00:00:00.001   2  b         2  q
    4 2003-12-01 00:00:00.001   2  b         3  w
    5 2003-12-01 00:00:00.001   2  b         4  e
    6 2003-12-01 00:00:00.002   3  c         2  q
    7 2003-12-01 00:00:00.002   3  c         3  w
    8 2003-12-01 00:00:00.002   3  c         4  e

    Join same size sources:

    >>> data = otp.join(d1, d2, on='same_size')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.000   1  a         2  q
    1 2003-12-01 00:00:00.001   2  b         3  w
    2 2003-12-01 00:00:00.002   3  c         4  e

    Adding prefix to the right source for all columns:

    >>> d_right = d2.add_prefix('right_')
    >>> data = otp.join(d1, d_right, on=d1['ID'] == d_right['right_ID'])
    >>> otp.run(data)
                         Time  ID  A  right_ID right_B
    0 2003-12-01 00:00:00.000   1  a         0
    1 2003-12-01 00:00:00.001   2  b         2       q
    2 2003-12-01 00:00:00.002   3  c         3       w

    This condition will be optimized during run time:

    >>> data = otp.join(d1, d2, on=(d1['ID'] == d2['ID']) & (d1['Time'] >= d2['Time']), how='left_outer')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.000   1  a         0
    1 2003-12-01 00:00:00.001   2  b         2  q
    2 2003-12-01 00:00:00.002   3  c         3  w

    This condition won't be optimized during run time, since it transforms the addition to time into a function.
    So please note, this way of using ``join`` is not recommended.

    >>> data = otp.join(d1, d2, on=(d1['ID'] == d2['ID']) & (d1['Time'] >= d2['Time'] + otp.Milli(1)), how='left_outer')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.000   1  a         0
    1 2003-12-01 00:00:00.001   2  b         2  q
    2 2003-12-01 00:00:00.002   3  c         3  w

    In such cases (adding/subtracting constants to time) the addition/subtraction of a number of milliseconds
    should be used.
    This example will return exactly the same result as the previous one, but it will be optimized, so runtime
    will be shorter.

    >>> data = otp.join(d1, d2, on=(d1['ID'] == d2['ID']) & (d1['Time'] >= d2['Time'] + 1), how='left_outer')
    >>> otp.run(data)
                         Time  ID  A  RIGHT_ID  B
    0 2003-12-01 00:00:00.000   1  a         0
    1 2003-12-01 00:00:00.001   2  b         2  q
    2 2003-12-01 00:00:00.002   3  c         3  w

    ``on`` can be a list of strings:

    >>> left = otp.Ticks(A=[1, 2, 3], B=[4, 6, 7])
    >>> right = otp.Ticks(A=[2, 3, 4], B=[6, 9, 8], C=[7, 2, 0])
    >>> data = otp.join(left, right, on=['A', 'B'], how='inner')
    >>> otp.run(data)
                         Time  A  B  C
    0 2003-12-01 00:00:00.001  2  6  7

    Use parameter ``output_type_index`` to specify which input class to use to create the output object.
    It may be useful in case some custom user class was used as input:

    >>> class CustomTick(otp.Tick):
    ...     def custom_method(self):
    ...         return 'custom_result'
    >>> data1 = otp.Tick(A=1)
    >>> data2 = CustomTick(B=2)
    >>> data = otp.join(data1, data2, on='same_size', output_type_index=1)
    >>> type(data)
    <class 'onetick.py.functions.CustomTick'>
    >>> data.custom_method()
    'custom_result'
    >>> otp.run(data)
            Time  A  B
    0 2003-12-01  1  2
    """
    output_type = output_type_by_index((left, right), output_type_index)

    on_list = []
    if isinstance(on, list):
        for column in on:
            if column not in left.schema:
                raise ValueError(f'`{column}` column does not exist in the left source.')
            if column not in right.schema:
                raise ValueError(f'`{column}` column does not exist in the right source.')
        if len(on) == 0:
            raise ValueError('`on` parameter can not be empty list.')
        on_list = on
        on = (left[on_list[0]] == right[on_list[0]])
        for column in on_list[1:]:
            on = on & (left[column] == right[column])

    timezone_hack = None
    if re.search(r'\b_TIMEZONE\b', str(on)):
        # join does not support using _TIMEZONE pseudo-field in join_criteria,
        # replacing it with temporary fields in the branches
        timezone_hack = '__TIMEZONE_HACK__'
        left[timezone_hack] = left['_TIMEZONE']
        right[timezone_hack] = right['_TIMEZONE']

    if str(on) == "all":
        on = f'1 = 1 or {rprefix}.TIMESTAMP >= 0'

    _LEFT_NODE_NAME = "__SRC_LEFT__"  # this is internal name
    _RIGHT_NODE_NAME = rprefix

    initial_left_source_node_name = left.node_name()
    initial_right_source_node_name = right.node_name()

    # we have to add _source prefix to all column operations
    # `on` expression is written with right, so we should modify it, we will restore it later
    left.node_name(_LEFT_NODE_NAME)
    right.node_name(_RIGHT_NODE_NAME)

    on = _add_node_name_prefix_to_columns_in_operation(on, left)
    on = _add_node_name_prefix_to_columns_in_operation(on, right)

    columns_name_set = set()
    columns = {}
    fields_to_skip_right_source = {'TIMESTAMP'}
    for name, dtype in chain(left.columns(skip_meta_fields=True).items(), right.columns(skip_meta_fields=True).items()):
        if name in columns_name_set:
            columns[_RIGHT_NODE_NAME + "_" + name] = dtype
            fields_to_skip_right_source.add(name)
        else:
            columns[name] = dtype
            columns_name_set.add(name)

    if how in ("left_outer", "outer"):
        join_type = "LEFT_OUTER"
        if how == "outer":
            warnings.warn("Value 'outer' for parameter 'how' is deprecated. Use 'left_outer' instead.",
                          FutureWarning)
    elif how == "inner":
        join_type = "INNER"
    else:
        raise ValueError("The 'how' parameter has wrong value. Only 'left_outer' and 'inner' are supported")

    if timezone_hack:
        on = re.sub(r'\._TIMEZONE\b', f'.{timezone_hack}', str(on))
        on = re.sub(r'\b_TIMEZONE\b', f'{_LEFT_NODE_NAME}.{timezone_hack}', str(on))

    # ------------------
    # create objects
    params = {"join_criteria": str(on), "join_type": join_type, "left_source": _LEFT_NODE_NAME}

    # return states of sources back
    left.node_name(initial_left_source_node_name)
    right.node_name(initial_right_source_node_name)
    if str(on) == "same_size":
        result = output_type(node=otq.JoinSameSizeTs(), schema=columns)
    else:
        result = output_type(node=otq.Join(**params), schema=columns)

    __copy_sources_on_merge_or_join(result, (left, right),
                                    names=(_LEFT_NODE_NAME, _RIGHT_NODE_NAME),
                                    output_type_index=output_type_index)

    rename_fields_dict = {}
    for lc, rc in zip_longest(left.columns(skip_meta_fields=True), right.columns(skip_meta_fields=True)):
        if lc:
            rename_fields_dict[f"{_LEFT_NODE_NAME}.{lc}"] = lc
        if rc:
            if rc not in fields_to_skip_right_source:
                rename_fields_dict[f"{_RIGHT_NODE_NAME}.{rc}"] = rc
            else:
                rename_fields_dict[f"{_RIGHT_NODE_NAME}.{rc}"] = f"{_RIGHT_NODE_NAME}_{rc}"

    __rename_fields(result, rename_fields_dict)
    result.sink(otq.Passthrough(fields=_LEFT_NODE_NAME + ".TIMESTAMP", drop_fields=True))

    items = []
    for name, dtype in result.columns(skip_meta_fields=True).items():
        items.append(ott.type2str(dtype) + " " + name)

    if keep_fields_not_in_schema:
        # Here we try to preserve fields of original sources that were not in schema
        # in their original form. If there's a duplication of fields or any other problem
        # in runtime, we'll be able to do nothing
        result.sink(otq.Passthrough(fields=_RIGHT_NODE_NAME + ".TIMESTAMP", drop_fields=True))
        result.sink(otq.RenameFieldsEp(rename_fields=rf"{_LEFT_NODE_NAME}\.(.*)=\1,{_RIGHT_NODE_NAME}\.(.*)=\1",
                                       use_regex=True))
        result.sink(otq.Table(fields=",".join(items), keep_input_fields=True))
    else:
        result.sink(otq.Table(fields=",".join(items)))

    if timezone_hack:
        result = result.drop([
            field for field in result.schema
            if field.endswith(timezone_hack)
        ])
        left.drop(timezone_hack, inplace=True)
        right.drop(timezone_hack, inplace=True)

    for column in on_list:
        result.drop(f'{_RIGHT_NODE_NAME}_{column}', inplace=True)

    return result


def join_by_time(sources, how="outer", on=None, policy=None, check_schema=True, leading=0,
                 match_if_identical_times=None, output_type_index=None, use_rename_ep=True,
                 source_fields_order=None, symbols=None):
    """
    Joins ticks from multiple input time series, based on input tick timestamps.

    A tick from the ``leading`` source is joined with the already arrived ticks from the other sources.

    >>> leading = otp.Ticks(A=[1, 2], offset=[1, 3])
    >>> other = otp.Ticks(B=[1], offset=[2])
    >>> otp.run(otp.join_by_time([leading, other]))
                         Time  A  B
    0 2003-12-01 00:00:00.001  1  0
    1 2003-12-01 00:00:00.003  2  1

    Note
    ----
    In case different ``sources`` have matching columns, an exception will be raised.

    To fix this error,
    functions :func:`Source.add_prefix` or :func:`Source.add_suffix` can be used to rename all columns in the source.

    Note that the resulting **TIMESTAMP** pseudo-column will be taken from the leading source,
    and timestamps of ticks from non-leading sources will not be added to the output,
    so if you need to save them, you need to copy the timestamp to some other column.

    See examples below.

    Parameters
    ----------
    sources: Collection[:class:`Source`]
        The collection of Source objects which will be joined
    how: 'outer' or 'inner'
        The method of join ("inner" or "outer").
        Inner join logic will propagate ticks only if all sources participated in forming it.
        Outer join will propagate all ticks even if they couldn't be joined with other sources
        (in this case the fields from other sources will have "zero" values depending on the type of the field).
        Default is "outer".
    on: Collection[:class:`Column`]
        ``on`` adds an extra check to the join - only ticks with the same ``on`` fields will be joined

        >>> leading = otp.Ticks(A=[1, 2], offset=[1, 3])
        >>> other = otp.Ticks(A=[2, 2], B=[1, 2], offset=[0, 2])
        >>> otp.run(otp.join_by_time([leading, other], on=['A']))
                             Time  A  B
        0 2003-12-01 00:00:00.001  1  0
        1 2003-12-01 00:00:00.003  2  2

    policy: 'arrival_order', 'latest_ticks', 'each_for_leader_with_first' or 'each_for_leader_with_latest'
        Policy of joining ticks with the same timestamps.
        The default value is "arrival_order", but it is set to "latest_ticks"
        if parameter ``match_if_identical_times`` is set to True.

        >>> leading = otp.Ticks(A=[1, 2], offset=[0, 0], OMDSEQ=[0, 3])
        >>> other = otp.Ticks(B=[1, 2], offset=[0, 0], OMDSEQ=[2, 4])

        Note: in the examples below we assume that all ticks have the same timestamps, but the order of ticks
        is as in the example.
        OMDSEQ is a special field that stores the order of ticks with the same timestamp

        - ``arrival_order``
          output tick generated on arrival of ``leading`` source tick

          >>> data = otp.join_by_time([leading, other], policy='arrival_order')
          >>> otp.run(data)[['Time', 'A', 'B']]
                  Time  A  B
          0 2003-12-01  1  0
          1 2003-12-01  2  1

        - ``latest_ticks``
          Tick generated at the time of expiration of a particular timestamp (when all ticks from all sources
          for the current timestamp arrived). Only the latest tick from the ``leading`` source will be used.

          >>> data = otp.join_by_time([leading, other], policy='latest_ticks')
          >>> otp.run(data)[['Time', 'A', 'B']]
                  Time  A  B
          0 2003-12-01  2  2

        - ``each_for_leader_with_first``
          Each tick from the ``leading`` source will be joined with the first tick from the other sources
          for the current timestamp

          >>> data = otp.join_by_time(
          ...     [leading, other],
          ...     policy='each_for_leader_with_first'
          ... )
          >>> otp.run(data)[['Time', 'A', 'B']]
                  Time  A  B
          0 2003-12-01  1  1
          1 2003-12-01  2  1

        - ``each_for_leader_with_latest``
          Each tick from the ``leading`` source will be joined with the last tick from the other sources
          for the current timestamp

          >>> data = otp.join_by_time(
          ...     [leading, other],
          ...     policy='each_for_leader_with_latest'
          ... )
          >>> otp.run(data)[['Time', 'A', 'B']]
                  Time  A  B
          0 2003-12-01  1  2
          1 2003-12-01  2  2
    check_schema: bool
        If True, onetick.py will check that all column names are unambiguous
        and that the columns listed in the `on` param exist in the sources' schemas.
        This can lead to a false positive error
        in case some event processors were sunk to the Source. To avoid this, set ``check_schema`` to False.
    leading: int, 'all', :class:`Source`, list of int, list of :class:`Source`
        A list of sources or their indexes. If this parameter is 'all', every source is considered to be leading.
        The logic of the leading source depends on the ``policy`` parameter.
        The default value is 0, meaning the first specified source will be the leader.

    match_if_identical_times: bool
        A True value of this parameter causes an output tick to be formed from input ticks with identical timestamps
        only.
        If parameter ``how`` is set to 'outer',
        default values of fields (``otp.nan``, 0, empty string) are propagated for
        sources that did not tick at a given timestamp.
        If this parameter is set to True, the default value of the ``policy`` parameter is set to 'latest_ticks'.
    output_type_index: int
        Specifies the index of the source in ``sources`` from which the type and properties of the output
        will be taken.
        Useful when joining sources that inherited from :class:`Source`.
        By default the output object type will be :class:`Source`.
    use_rename_ep: bool
        This parameter specifies if the :py:class:`onetick.query.RenameFields`
        event processor will be used in the internal implementation of this function or not.
        This event processor can't be used in generic aggregations, so set this parameter to False
        if ``join_by_time`` is used in generic aggregation logic.
    source_fields_order: list of int, list of :class:`Source`
        Controls the order of fields in output ticks.
        If set, all input source indexes or objects must be specified.
        By default, the order of the sources is the same as in the ``sources`` list.
    symbols: str, list of str or functions, :class:`Source`, :py:class:`onetick.query.GraphQuery`
        Bound symbol(s) passed as a string, a list of strings, or as a "symbols" query whose results
        include the ``SYMBOL_NAME`` column. The start/end times for the
        symbols query will be taken from the :meth:`run` params.
        See :ref:`symbols <static/concepts/symbols:Symbols: bound and unbound>` for more details.

        .. warning::
            Passing more than one source for join and setting the ``symbols`` parameter at the same time
            aren't supported

        .. note::
            If bound symbols are specified as :class:`Source` or :py:class:`onetick.query.GraphQuery`,
            you **should** set the schema for the returned :class:`Source` object manually:
            ``onetick-py`` couldn't determine symbols from the sub-query before running the query.

        .. note::
            If bound symbols are specified as :class:`Source` or :py:class:`onetick.query.GraphQuery`,
            and this sub-query returns only one symbol name, output columns wouldn't have a prefix with
            the symbol name.

    See also
    --------
    **JOIN_BY_TIME** OneTick event processor

    Examples
    --------
    >>> d1 = otp.Ticks({'A': [1, 2, 3], 'offset': [1, 2, 3]})
    >>> d2 = otp.Ticks({'B': [1, 2, 4], 'offset': [1, 2, 4]})
    >>> otp.run(d1)
                         Time  A
    0 2003-12-01 00:00:00.001  1
    1 2003-12-01 00:00:00.002  2
    2 2003-12-01 00:00:00.003  3
    >>> otp.run(d2)
                         Time  B
    0 2003-12-01 00:00:00.001  1
    1 2003-12-01 00:00:00.002  2
    2 2003-12-01 00:00:00.004  4

    Default joining logic, outer join with the first source as the leader:

    >>> data = otp.join_by_time([d1, d2])
    >>> otp.run(data)
                         Time  A  B
    0 2003-12-01 00:00:00.001  1  0
    1 2003-12-01 00:00:00.002  2  1
    2 2003-12-01 00:00:00.003  3  2

    The leading source can be changed by using parameter ``leading``:

    >>> data = otp.join_by_time([d1, d2], leading=1)
    >>> otp.run(data)
                         Time  A  B
    0 2003-12-01 00:00:00.001  1  1
    1 2003-12-01 00:00:00.002  2  2
    2 2003-12-01 00:00:00.004  3  4

    Note that OneTick's logic is different depending on the order of the sources specified,
    so specifying the ``leading`` parameter in the previous example is not the same as changing the order
    of the sources here:

    >>> data = otp.join_by_time([d2, d1], leading=0)
    >>> otp.run(data)
                         Time  B  A
    0 2003-12-01 00:00:00.001  1  0
    1 2003-12-01 00:00:00.002  2  1
    2 2003-12-01 00:00:00.004  4  3

    Parameter ``source_fields_order`` can be used to change the order of fields in the output,
    but it also affects the joining logic the same way as changing the order of sources:

    >>> data = otp.join_by_time([d1, d2], leading=1, source_fields_order=[1, 0])
    >>> otp.run(data)
                         Time  B  A
    0 2003-12-01 00:00:00.001  1  0
    1 2003-12-01 00:00:00.002  2  1
    2 2003-12-01 00:00:00.004  4  3

    Parameter ``how`` can be set to "inner".
    In this case only ticks that were successfully joined from all sources will be propagated:

    >>> data = otp.join_by_time([d1, d2], how='inner')
    >>> otp.run(data)
                         Time  A  B
    0 2003-12-01 00:00:00.002  2  1
    1 2003-12-01 00:00:00.003  3  2

    Set parameter ``match_if_identical_times`` to only join ticks with the same timestamps:

    >>> data = otp.join_by_time([d1, d2], how='inner', match_if_identical_times=True)
    >>> otp.run(data)
                         Time  A  B
    0 2003-12-01 00:00:00.001  1  1
    1 2003-12-01 00:00:00.002  2  2

    In case of conflicting names in different sources, an exception will be raised:

    >>> d3 = otp.Ticks({'A': [1, 2, 4], 'offset': [1, 2, 4]})
    >>> data = otp.join_by_time([d1, d3])
    Traceback (most recent call last):
    ...
    ValueError: There are matched columns between sources: A

    Adding a prefix to the right source for all columns will fix this problem:
|
|
1124
|
+
|
|
1125
|
+
>>> data = otp.join_by_time([d1, d3.add_prefix('right_')])
|
|
1126
|
+
>>> otp.run(data)
|
|
1127
|
+
Time A right_A
|
|
1128
|
+
0 2003-12-01 00:00:00.001 1 0
|
|
1129
|
+
1 2003-12-01 00:00:00.002 2 1
|
|
1130
|
+
2 2003-12-01 00:00:00.003 3 2
|
|
1131
|
+
|
|
1132
|
+
Note that timestamps from the non-leading source are not added to the output.
|
|
1133
|
+
You can add them manually in a different field:
|
|
1134
|
+
|
|
1135
|
+
>>> d3['D3_TIMESTAMP'] = d3['TIMESTAMP']
|
|
1136
|
+
>>> data = otp.join_by_time([d1, d3.add_prefix('right_')])
|
|
1137
|
+
>>> otp.run(data)
|
|
1138
|
+
Time A right_A right_D3_TIMESTAMP
|
|
1139
|
+
0 2003-12-01 00:00:00.001 1 0 1969-12-31 19:00:00.000
|
|
1140
|
+
1 2003-12-01 00:00:00.002 2 1 2003-12-01 00:00:00.001
|
|
1141
|
+
2 2003-12-01 00:00:00.003 3 2 2003-12-01 00:00:00.002
|
|
1142
|
+
|
|
1143
|
+
Use parameter ``output_type_index`` to specify which input class to use to create output object.
|
|
1144
|
+
It may be useful in case some custom user class was used as input:
|
|
1145
|
+
|
|
1146
|
+
>>> class CustomTick(otp.Tick):
|
|
1147
|
+
... def custom_method(self):
|
|
1148
|
+
... return 'custom_result'
|
|
1149
|
+
>>> data1 = otp.Tick(A=1)
|
|
1150
|
+
>>> data2 = CustomTick(B=2)
|
|
1151
|
+
>>> data = otp.join_by_time([data1, data2], match_if_identical_times=True, output_type_index=1)
|
|
1152
|
+
>>> type(data)
|
|
1153
|
+
<class 'onetick.py.functions.CustomTick'>
|
|
1154
|
+
>>> data.custom_method()
|
|
1155
|
+
'custom_result'
|
|
1156
|
+
>>> otp.run(data)
|
|
1157
|
+
Time A B
|
|
1158
|
+
0 2003-12-01 1 2
|
|
1159
|
+
|
|
1160
|
+
Use parameter ``source_fields_order`` to specify the order of output fields:
|
|
1161
|
+
|
|
1162
|
+
>>> a = otp.Ticks(A=[1, 2])
|
|
1163
|
+
>>> b = otp.Ticks(B=[1, 2])
|
|
1164
|
+
>>> c = otp.Ticks(C=[1, 2])
|
|
1165
|
+
>>> data = otp.join_by_time([a, b, c], match_if_identical_times=True, source_fields_order=[c, b, a])
|
|
1166
|
+
>>> otp.run(data)
|
|
1167
|
+
Time C B A
|
|
1168
|
+
0 2003-12-01 00:00:00.000 1 1 1
|
|
1169
|
+
1 2003-12-01 00:00:00.001 2 2 2
|
|
1170
|
+
|
|
1171
|
+
Indexes can be used too:
|
|
1172
|
+
|
|
1173
|
+
>>> data = otp.join_by_time([a, b, c], match_if_identical_times=True, source_fields_order=[1, 2, 0])
|
|
1174
|
+
>>> otp.run(data)
|
|
1175
|
+
Time B C A
|
|
1176
|
+
0 2003-12-01 00:00:00.000 1 1 1
|
|
1177
|
+
1 2003-12-01 00:00:00.001 2 2 2
|
|
1178
|
+
|
|
1179
|
+
Use parameter `symbols` to specify bound symbols:
|
|
1180
|
+
|
|
1181
|
+
>>> data = otp.Ticks(X=[1, 2, 3, 4])
|
|
1182
|
+
>>> data = otp.join_by_time([data], symbols=['A', 'B'], match_if_identical_times=True)
|
|
1183
|
+
>>> otp.run(data)
|
|
1184
|
+
Time A.X B.X
|
|
1185
|
+
0 2003-12-01 00:00:00.000 1 1
|
|
1186
|
+
1 2003-12-01 00:00:00.001 2 2
|
|
1187
|
+
2 2003-12-01 00:00:00.002 3 3
|
|
1188
|
+
3 2003-12-01 00:00:00.003 4 4
|
|
1189
|
+
|
|
1190
|
+
Returns
|
|
1191
|
+
-------
|
|
1192
|
+
:class:`Source` or same class as ``sources[output_type_index]``
|
|
1193
|
+
A time series of ticks.
|
|
1194
|
+
"""
|
|
1195
|
+
from onetick.py.core.source import _Source
|
|
1196
|
+
|
|
1197
|
+
output_type = output_type_by_index(sources, output_type_index)
|
|
1198
|
+
|
|
1199
|
+
if len(sources) > 1 and symbols:
|
|
1200
|
+
raise ValueError(
|
|
1201
|
+
'It\'s impossible to use `join_by_time` with multiple sources, '
|
|
1202
|
+
'when bound symbols are set via `symbols` parameter.'
|
|
1203
|
+
)
|
|
1204
|
+
|
|
1205
|
+
join_str_keys = []
|
|
1206
|
+
|
|
1207
|
+
# if key is set, then generalize it, ie convert into list;
|
|
1208
|
+
# then remove keys from 'columns_count' dict to pass validation after
|
|
1209
|
+
if on is not None:
|
|
1210
|
+
if isinstance(on, list):
|
|
1211
|
+
# okay
|
|
1212
|
+
pass
|
|
1213
|
+
elif isinstance(on, Column):
|
|
1214
|
+
on = [on]
|
|
1215
|
+
elif isinstance(on, str):
|
|
1216
|
+
on = [on]
|
|
1217
|
+
else:
|
|
1218
|
+
raise TypeError(f"It is not supported to have '{type(on)}' type as a key")
|
|
1219
|
+
|
|
1220
|
+
for join_key in on:
|
|
1221
|
+
dtypes = set()
|
|
1222
|
+
if check_schema:
|
|
1223
|
+
for source in sources:
|
|
1224
|
+
try:
|
|
1225
|
+
key_type = source.schema[str(join_key)]
|
|
1226
|
+
except KeyError as e:
|
|
1227
|
+
raise KeyError(f"Column '{join_key}' not found in source schema {source}") from e
|
|
1228
|
+
type_name = ott.type2str(key_type)
|
|
1229
|
+
if type_name == "string[64]":
|
|
1230
|
+
type_name = "string"
|
|
1231
|
+
dtypes.add(type_name)
|
|
1232
|
+
if len(dtypes) > 1:
|
|
1233
|
+
raise TypeError(f"Column '{join_key}' has different types in sources: {dtypes}")
|
|
1234
|
+
|
|
1235
|
+
if isinstance(join_key, Column):
|
|
1236
|
+
join_str_keys.append(str(join_key))
|
|
1237
|
+
elif isinstance(join_key, str):
|
|
1238
|
+
join_str_keys.append(join_key)
|
|
1239
|
+
|
|
1240
|
+
if check_schema:
|
|
1241
|
+
_check_schema_for_join_by_time(join_str_keys, sources)
|
|
1242
|
+
|
|
1243
|
+
if how not in ["inner", "outer"]:
|
|
1244
|
+
raise ValueError('Wrong value for the "how" parameter. It is allowed to use "inner" or "outer" values')
|
|
1245
|
+
join_type = how.upper()
|
|
1246
|
+
|
|
1247
|
+
# ------------------
|
|
1248
|
+
# create objects
|
|
1249
|
+
params = {"add_source_prefix": False, "join_type": join_type}
|
|
1250
|
+
leading = _fill_leading_sources_param(leading, params, sources)
|
|
1251
|
+
ordered_sources = _fill_source_fields_order_param(source_fields_order, params, sources)
|
|
1252
|
+
|
|
1253
|
+
if on is not None:
|
|
1254
|
+
params["join_keys"] = ",".join(join_str_keys)
|
|
1255
|
+
|
|
1256
|
+
if policy is not None:
|
|
1257
|
+
policies = {"arrival_order", "latest_ticks", "each_for_leader_with_first", "each_for_leader_with_latest"}
|
|
1258
|
+
if policy.lower() not in policies:
|
|
1259
|
+
raise ValueError("Invalid policy. Only the following ones are allowed: " + ", ".join(policies) + ".")
|
|
1260
|
+
params["same_timestamp_join_policy"] = policy.upper()
|
|
1261
|
+
|
|
1262
|
+
if match_if_identical_times is not None:
|
|
1263
|
+
params["match_if_identical_times"] = match_if_identical_times
|
|
1264
|
+
|
|
1265
|
+
is_bound_multi_symbol = False
|
|
1266
|
+
is_source_symbols = isinstance(symbols, (_Source, _QueryEvalWrapper))
|
|
1267
|
+
|
|
1268
|
+
if isinstance(symbols, list) and len(symbols) > 1 or is_source_symbols:
|
|
1269
|
+
is_bound_multi_symbol = True
|
|
1270
|
+
params['add_source_prefix'] = True
|
|
1271
|
+
|
|
1272
|
+
columns = {name: dtype for src in ordered_sources for name, dtype in src.columns(skip_meta_fields=True).items()}
|
|
1273
|
+
|
|
1274
|
+
tmp_otq = TmpOtq()
|
|
1275
|
+
result = output_type(node=apply_symbol_to_ep(otq.JoinByTime(**params), symbols, tmp_otq), schema=columns)
|
|
1276
|
+
result._tmp_otq.merge(tmp_otq)
|
|
1277
|
+
|
|
1278
|
+
__copy_sources_on_merge_or_join(result, sources,
|
|
1279
|
+
symbols=symbols,
|
|
1280
|
+
names=True,
|
|
1281
|
+
drop_meta=True,
|
|
1282
|
+
leading=leading,
|
|
1283
|
+
output_type_index=output_type_index,
|
|
1284
|
+
use_rename_ep=use_rename_ep)
|
|
1285
|
+
|
|
1286
|
+
if is_bound_multi_symbol:
|
|
1287
|
+
if not is_source_symbols:
|
|
1288
|
+
# this isn't supported for symbols defined as otp.Source
|
|
1289
|
+
new_columns = {
|
|
1290
|
+
f"{sym}.{col}": dtype for col, dtype in columns.items() for sym in symbols
|
|
1291
|
+
}
|
|
1292
|
+
result.schema.update(**new_columns)
|
|
1293
|
+
|
|
1294
|
+
result = result.drop(columns=list(columns.keys()))
|
|
1295
|
+
|
|
1296
|
+
if is_source_symbols:
|
|
1297
|
+
result = result.rename({r'__SRC_0__\.(.*)': r'\1'}, use_regex=True)
|
|
1298
|
+
|
|
1299
|
+
if how == "outer":
|
|
1300
|
+
# adding table to convert types in schema, e.g. float to int
|
|
1301
|
+
result._add_table(strict=False)
|
|
1302
|
+
|
|
1303
|
+
return result
|
|
1304
|
+
|
|
1305
|
+
|
|
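A quick sketch of the ``on`` parameter, which the docstring above describes but does not demonstrate (a minimal sketch, assuming the usual ``import onetick.py as otp``; not executed here):

    import onetick.py as otp

    d1 = otp.Ticks({'KEY': ['x', 'y'], 'A': [1, 2]})
    d2 = otp.Ticks({'KEY': ['x', 'y'], 'B': [3, 4]})
    # 'KEY' must exist in both schemas and is excluded from the
    # duplicate-column check performed when check_schema=True.
    data = otp.join_by_time([d1, d2], on='KEY', match_if_identical_times=True)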
+def _fill_source_fields_order_param(source_fields_order, params, sources):
+    if source_fields_order is None:
+        return sources
+    if not isinstance(source_fields_order, Sequence):
+        raise ValueError(f"Wrong type for parameter 'source_fields_order': {type(source_fields_order)}")
+    if len(source_fields_order) != len(sources):
+        raise ValueError("Wrong number of sources in parameter 'source_fields_order':"
+                         f" {len(source_fields_order)} (need {len(sources)})")
+    if isinstance(source_fields_order[0], int):
+        indexes = source_fields_order
+        ordered_sources = [sources[i] for i in indexes]
+    else:
+        indexes = [__find_by_id(sources, src) for src in source_fields_order]
+        ordered_sources = source_fields_order
+    params['source_order'] = ','.join(f'__SRC_{i}__' for i in indexes)
+    return ordered_sources
+
+
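For intuition, a standalone sketch of the ordering logic above: either a list of indexes or the source objects themselves is mapped onto the ``__SRC_i__`` node names (plain Python; the stand-in sources here are arbitrary objects):

    def order_names(source_fields_order, sources):
        # Mirrors the helper's two branches: indexes vs. source objects.
        if isinstance(source_fields_order[0], int):
            indexes = source_fields_order
        else:
            indexes = [next(i for i, s in enumerate(sources) if s is src)
                       for src in source_fields_order]
        return ','.join(f'__SRC_{i}__' for i in indexes)

    a, b = object(), object()
    print(order_names([1, 0], [a, b]))  # __SRC_1__,__SRC_0__
    print(order_names([b, a], [a, b]))  # __SRC_1__,__SRC_0__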
+@singledispatch
+def _fill_leading_sources_param(leading, params, sources):
+    from onetick.py.core.source import _Source
+
+    if isinstance(leading, _Source):  # TODO: PY-104 Get rid of circular dependencies in code to avoid local import
+        result = f"__SRC_{__find_by_id(sources, leading)}__"
+        params["leading_sources"] = result
+        result = [result]
+    elif leading == "all":  # all sources are leading, which is specified by an empty string
+        params["leading_sources"] = ""
+        result = []
+    else:
+        raise ValueError(
+            "wrong leading param was specified, please use any of int, 'all' literal, list of int, list of _Source"
+        )
+    return result
+
+
+@_fill_leading_sources_param.register(int)
+def _(leading, params, sources):
+    if leading < 0:
+        leading = len(sources) + leading
+    if 0 <= leading < len(sources):
+        result = f"__SRC_{leading}__"
+        params["leading_sources"] = result
+        return [result]
+    else:
+        raise ValueError(
+            f"leading source index should be in range(-len(source), len(source)), but {leading} was specified."
+        )
+
+
+@_fill_leading_sources_param.register(list)  # type: ignore  # _ already defined above
+@_fill_leading_sources_param.register(tuple)
+def _(leading, params, sources):
+    if len(leading) > len(sources):
+        raise ValueError("Number of leading sources can't be bigger than the number of sources")
+    if isinstance(leading[0], int):
+        result = leading
+    else:
+        result = [__find_by_id(sources, lead) for lead in leading]
+    indexes = ",".join(f"__SRC_{i}__" for i in result)
+    params["leading_sources"] = indexes
+    return result
+
+
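The three overloads above rely on functools.singledispatch, which picks an implementation by the type of the first argument; a minimal self-contained illustration of that mechanism (not the library code itself):

    from functools import singledispatch

    @singledispatch
    def describe(leading):
        # fallback for unsupported types, like the base implementation above
        raise ValueError(f"unsupported leading spec: {leading!r}")

    @describe.register(int)
    def _(leading):
        return f"__SRC_{leading}__"

    @describe.register(list)
    @describe.register(tuple)
    def _(leading):
        return ",".join(f"__SRC_{i}__" for i in leading)

    print(describe(0))       # __SRC_0__
    print(describe([0, 2]))  # __SRC_0__,__SRC_2__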
+def __find_by_id(collection, item):
+    for index, s in enumerate(collection):
+        if s is item:
+            return index
+    raise ValueError("The source should be in join sources list")
+
+
+def _check_schema_for_join_by_time(join_str_keys, sources):
+    # check that there aren't matching columns
+    columns_count = Counter()
+    for src in sources:
+        columns_count.update(src.columns(skip_meta_fields=True).keys())
+    for join_key in join_str_keys:
+        del columns_count[join_key]
+    matched = [k for k, value in columns_count.items() if value > 1]
+    if "OMDSEQ" in matched:
+        # OMDSEQ behaves like the TIMESTAMP field
+        matched.remove("OMDSEQ")
+    if len(matched):
+        raise ValueError(f"There are matched columns between sources: {','.join(matched)}")
+
+
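The duplicate-column check boils down to counting field names across the schemas with collections.Counter; a minimal sketch with plain dicts standing in for source schemas:

    from collections import Counter

    schemas = [{'A': int, 'B': float}, {'B': float, 'C': str}]
    columns_count = Counter()
    for schema in schemas:
        columns_count.update(schema.keys())
    matched = [name for name, count in columns_count.items() if count > 1]
    print(matched)  # ['B'] -> join_by_time would raise for this overlap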
+def apply_query(query,
+                in_sources=None,
+                output_pins=None,
+                shared_state_variables_list=None,
+                output_type_index=None,
+                **params):
+    from onetick.py.sources import query as otp_query
+
+    output_type = output_type_by_index(in_sources, output_type_index)
+    output_pins = output_pins if output_pins else []
+    in_sources = in_sources if in_sources else {}
+    shared_state_variables_list = shared_state_variables_list if shared_state_variables_list else []
+    if isinstance(query, str):
+        # it seems that a path was passed
+        query = otp_query(query, **params)
+
+    elif isinstance(query, otp_query) and params:
+        query.update_params(**params)
+
+    columns = {}
+
+    for src in in_sources.values():
+        columns.update(src.columns(skip_meta_fields=True))
+
+    str_params = query.str_params
+
+    shared_state_variables = ",".join(shared_state_variables_list)
+
+    inputs_need_unbound_symbols = {in_pin: src._is_unbound_required() for in_pin, src in in_sources.items()}
+    if query.graph_info is not None and query.graph_info.has_unbound_if_pinned(inputs_need_unbound_symbols):
+        symbol = adaptive
+    else:
+        symbol = None
+
+    nested_src = output_type(
+        node=otq.NestedOtq(query.path, str_params, shared_state_variables=shared_state_variables),
+        _has_output=len(output_pins) > 0,
+        _symbols=symbol,
+        schema=columns,
+    )
+
+    eps = defaultdict()
+
+    for in_pin, src in in_sources.items():
+        nested_src.source(src.node().copy_graph(eps), in_pin)
+        nested_src.node().add_rules(src.node().copy_rules())
+        nested_src._set_sources_dates(src)
+        nested_src._merge_tmp_otq(src)
+
+    if len(output_pins) == 0:
+        return nested_src
+
+    if len(output_pins) > 1:
+        result = []
+
+        for out_pin in output_pins:
+            res_src = nested_src.copy()
+            res_src.node().out_pin(out_pin)
+            # NOTE: need to comment out this node
+            res_src.sink(otq.Passthrough())
+
+            # apply config customization
+            query.config._apply(out_pin, res_src)
+
+            result.append(res_src)
+
+        return tuple(result)
+    else:
+        # TODO: move setting out_pin to the creation step of nested_src.
+        # It doesn't seem to work now, because .copy() of _Source doesn't
+        # copy the out_pin reference; need to check.
+        nested_src.node().out_pin(output_pins[0])
+
+        # apply config customization
+        query.config._apply(output_pins[0], nested_src)
+
+        return nested_src
+
+
+def apply(query, *args, **kwargs):
+    return apply_query(query.path, *args, **kwargs, **query.params)
+
+
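A hypothetical usage sketch of ``apply_query`` with multiple output pins (the query path and pin names below are made up for illustration; with more than one pin a tuple of sources is returned, one per pin):

    import onetick.py as otp
    from onetick.py.functions import apply_query

    src = otp.Tick(A=1)
    trades, quotes = apply_query('/queries/split.otq',        # hypothetical path
                                 in_sources={'IN': src},      # hypothetical input pin
                                 output_pins=['TRD', 'QTE'])  # hypothetical output pins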
+def cut(column: 'Column', bins: Union[int, List[float]], labels: Optional[List[str]] = None):
+    """
+    Bin values into discrete intervals (mimics :pandas:`pandas.cut`).
+
+    Parameters
+    ----------
+    column: :py:class:`~onetick.py.Column`
+        Column with numeric data used to build bins.
+    bins: int or List[float]
+
+        When List[float] - defines the bin edges.
+
+        When int - defines the number of equal-width bins in the range of x.
+    labels: List[str]
+        Labels used to name the resulting bins.
+        If not set, bins are numeric intervals like (5.0000000000, 7.5000000000].
+
+    Returns
+    -------
+    object that can be set to :py:class:`~onetick.py.Column` via :py:meth:`~onetick.py.Source.__setitem__`
+
+    Examples
+    --------
+    >>> # OTdirective: snippet-name: Source.functions.cut;
+    >>> data = otp.Ticks({"X": [9, 8, 5, 6, 7, 0, ]})
+    >>> data['bin'] = otp.cut(data['X'], bins=3, labels=['a', 'b', 'c'])
+    >>> otp.run(data)[['X', 'bin']]
+       X bin
+    0  9   c
+    1  8   c
+    2  5   b
+    3  6   b
+    4  7   c
+    5  0   a
+
+    """
+    src = column.obj_ref
+    return _CutBuilder(src, column, bins, labels=labels)
+
+
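For intuition only, a pure-Python sketch of the equal-width, right-closed binning that ``otp.cut`` mirrors from :pandas:`pandas.cut` (the real function builds the equivalent OneTick expression instead):

    def cut_sketch(values, bins, labels):
        lo, hi = min(values), max(values)
        edges = [lo + (hi - lo) * k / bins for k in range(bins + 1)]
        out = []
        for v in values:
            # right-closed bins (lo, e1], (e1, e2], ...; the minimum falls in the first bin
            i = 0
            while i < bins - 1 and v > edges[i + 1]:
                i += 1
            out.append(labels[i])
        return out

    print(cut_sketch([9, 8, 5, 6, 7, 0], 3, ['a', 'b', 'c']))
    # ['c', 'c', 'b', 'b', 'c', 'a'] -- matches the doctest above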
+def qcut(column: 'Column', q: Union[int, List[float]], labels: Optional[List[str]] = None):
+    """
+    Quantile-based discretization function (mimics :pandas:`pandas.qcut`).
+
+    Parameters
+    ----------
+    column: :py:class:`~onetick.py.Column`
+        Column with numeric data used to build bins.
+    q: int or List[float]
+
+        When List[float] - an array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
+
+        When int - the number of quantiles: 10 for deciles, 4 for quartiles, etc.
+    labels: List[str]
+        Labels used to name the resulting bins.
+        If not set, bins are numeric intervals like (5.0000000000, 7.5000000000].
+
+    Returns
+    -------
+    object that can be set to :py:class:`~onetick.py.Column` via :py:meth:`~onetick.py.Source.__setitem__`
+
+    Examples
+    --------
+    >>> # OTdirective: snippet-name: Source.functions.qcut;
+    >>> data = otp.Ticks({"X": [10, 3, 5, 6, 7, 1]})
+    >>> data['bin'] = otp.qcut(data['X'], q=3, labels=['a', 'b', 'c'])
+    >>> otp.run(data)[['X', 'bin']]
+        X bin
+    0  10   c
+    1   3   a
+    2   5   b
+    3   6   b
+    4   7   c
+    5   1   a
+    """
+    # TODO: handle q as a List[float] like [0, .25, .5, .75, 1.]
+    src = column.obj_ref
+    return _QCutBuilder(src, column, q, labels=labels)
+
+
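Likewise, a pure-Python sketch of the quantile bucketing that ``otp.qcut`` mirrors from :pandas:`pandas.qcut`: rank the values, then split the ranks into ``q`` equal-count groups (assumes no ties; illustration only):

    def qcut_sketch(values, q, labels):
        order = sorted(range(len(values)), key=values.__getitem__)
        out = [None] * len(values)
        for rank, idx in enumerate(order):
            out[idx] = labels[rank * q // len(values)]
        return out

    print(qcut_sketch([10, 3, 5, 6, 7, 1], 3, ['a', 'b', 'c']))
    # ['c', 'a', 'b', 'b', 'c', 'a'] -- matches the doctest above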
+def coalesce(sources, max_source_delay: float = 0.0, output_type_index: Optional[int] = None):
+    """
+    Used to fill the gaps in one time series with the ticks from one or several other time series.
+
+    This event processor considers ticks that arrive from several sources at the same time as being the same,
+    allowing for possible delay across the sources when determining whether the ticks are the same.
+    When the same tick arrives from several sources, it is only propagated from the source
+    that has the highest priority among those sources.
+    Input ticks do not necessarily have the same structure - they can have different fields.
+
+    In order to distinguish the time series, the event processor adds the SYMBOL_NAME field.
+    A SOURCE field is also added to each tick that lacks it, to identify the source the tick is coming from.
+    Hence, one must avoid adding a SOURCE field in event processors positioned after COALESCE.
+
+    Parameters
+    ----------
+    sources: list of :class:`Source`
+        List of the sources to coalesce. This list is also treated as the priority order:
+        the first member of the list has the highest priority when determining whether ticks are the same.
+    max_source_delay: float
+        The maximum time in seconds by which a tick from one input time series
+        can arrive later than the same tick from another time series.
+    output_type_index: int
+        Specifies the index of the source in ``sources`` from which the type and properties of the output are taken.
+        Useful when merging sources that inherit from :class:`Source`.
+        By default, the output object type will be :class:`Source`.
+
+    Returns
+    -------
+    :class:`Source`
+        A time series of ticks.
+
+    See also
+    --------
+    **COALESCE** OneTick event processor
+
+    Examples
+    --------
+    If ticks from different sources have the same time,
+    only the tick from the source with the highest priority is propagated.
+
+    >>> data1 = otp.Ticks(A=[1, 2])
+    >>> data2 = otp.Ticks(A=[3, 4])
+    >>> data = otp.coalesce([data2, data1])
+    >>> otp.run(data)[['Time', 'A']]
+                         Time  A
+    0 2003-12-01 00:00:00.000  3
+    1 2003-12-01 00:00:00.001  4
+
+    We can use the ``max_source_delay`` parameter to expand the time interval in which
+    ticks are considered to have the "same time".
+
+    >>> data1 = otp.Ticks({
+    ...     'A': [1, 2, 3],
+    ...     'offset': [0, 3000, 6000],
+    ... })
+    >>> data2 = otp.Ticks({
+    ...     'A': [4, 5, 6],
+    ...     # 4 is delayed by less than one second from 1
+    ...     # 5 is delayed by one second from 2
+    ...     # 6 is delayed by more than one second from 3
+    ...     'offset': [999, 4000, 7001],
+    ... })
+    >>> data = otp.coalesce([data2, data1], max_source_delay=1)
+    >>> otp.run(data)[['Time', 'A']]
+                         Time  A
+    0 2003-12-01 00:00:00.999  4
+    1 2003-12-01 00:00:04.000  5
+    2 2003-12-01 00:00:06.000  3
+    3 2003-12-01 00:00:07.001  6
+    """
+    if not sources:
+        raise ValueError("Coalesce should have one or more inputs")
+
+    output_type = output_type_by_index(sources, output_type_index)
+
+    # change node names for sources, the COALESCE ep needs them
+    new_node_names = [
+        f'__COALESCE_SRC_{i}__' for i, _ in enumerate(sources, start=1)
+    ]
+
+    node = otq.Coalesce(
+        priority_order=','.join(new_node_names),
+        max_source_delay=max_source_delay,
+    )
+
+    columns = {
+        # these fields will be added by the COALESCE ep
+        'SYMBOL_NAME': str,
+        'TICK_TYPE': str,
+    }
+    for source in sources:
+        for name in ['SYMBOL_NAME', 'TICK_TYPE']:
+            if name in source.schema:
+                raise ValueError(f"Field with name '{name}' is already present in the source. "
+                                 'Please, rename or delete that field prior to invoking coalesce().')
+        shared_columns = set(source.schema).intersection(columns)
+        for name in shared_columns:
+            type_1, type_2 = source.schema[name], columns[name]
+            if type_1 != type_2:
+                raise ValueError(f"Conflicting types for field '{name}' in different sources: {type_1}, {type_2}")
+        columns.update(source.schema)
+
+    # TODO: do we need the SOURCE field (especially when node names are auto-generated)?
+    # this field will be added by COALESCE if it's not present in the sources
+    columns.setdefault('SOURCE', str)
+
+    result = output_type(node, schema=columns)
+
+    __copy_sources_on_merge_or_join(result, sources, names=new_node_names, output_type_index=output_type_index)
+    return result
+
+
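A toy model of the exact-timestamp case (``max_source_delay=0``) described above: when ticks from several sources share a timestamp, only the copy from the source listed first (highest priority) is propagated. Plain tuples stand in for ticks; this illustrates the semantics, not the implementation:

    def coalesce_sketch(streams):
        seen, out = set(), []
        for stream in streams:  # iterate in priority order, so earlier streams win ties
            for ts, value in stream:
                if ts not in seen:
                    seen.add(ts)
                    out.append((ts, value))
        return sorted(out)

    data2 = [(0, 3), (1, 4)]  # higher priority
    data1 = [(0, 1), (1, 2)]
    print(coalesce_sketch([data2, data1]))  # [(0, 3), (1, 4)]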
+def corp_actions(source,
+                 adjustment_date: Union[ott.date, ott.datetime, dt.date, dt.datetime, int, str, None] = None,
+                 adjustment_date_tz: Union[str, Type[default]] = default,
+                 fields=None,
+                 adjust_rule="PRICE",
+                 apply_split: bool = True,
+                 apply_spinoff: bool = False,
+                 apply_rights: Optional[bool] = None,
+                 apply_cash_dividend: bool = False,
+                 apply_stock_dividend: bool = False,
+                 apply_security_splice: bool = False,
+                 apply_others: str = "",
+                 apply_all: bool = False):
+    """
+    Adjusts values using corporate actions information loaded into OneTick
+    from the reference data file. To use it, the location of the reference database must
+    be specified via the OneTick configuration.
+
+    Parameters
+    ----------
+    source: :py:class:`onetick.py.Source`
+        Source object adjusted by the corporate actions information.
+    adjustment_date: :py:class:`onetick.py.date`, :py:class:`onetick.py.datetime`, int, str, None, optional
+        The date as of which the values are adjusted.
+        All corporate actions of the types specified in the parameters
+        that lie between the tick timestamp and the adjustment date will be applied to each tick.
+
+        This parameter can be either a date or a datetime.
+        `int` and `str` formats can be *YYYYMMDD* or *YYYYMMDDhhmmss*.
+        When the parameter is a date, the time is assumed to be 17:00:00 GMT
+        and parameter ``adjustment_date_tz`` is ignored.
+
+        If it is not set, the values are adjusted as of the end date of the query.
+
+        Notice that the ``adjustment_date`` is affected neither by *_SYMBOL_PARAM._PARAM_END_TIME_NANOS*
+        nor by the *apply_times_daily* setting in :py:func:`onetick.py.run`.
+
+    adjustment_date_tz: str, optional
+        Timezone for the ``adjustment_date``.
+
+        By default the global :py:attr:`tz<onetick.py.configuration.Config.tz>` value is used.
+        The local timezone can't be used, so in that case the parameter is set to GMT.
+        When ``adjustment_date`` is in YYYYMMDD format, this parameter is set to GMT.
+    fields: str, optional
+        A comma-separated list of fields to be adjusted. If this parameter is not set,
+        some default adjustments will take place if appropriately named fields exist in the tick:
+
+        - If the ``adjust_rule`` parameter is set to PRICE, and the PRICE field is present,
+          it will get adjusted. If the fields ASK_PRICE or BID_PRICE are present, they will get adjusted.
+          If the fields ASK_VALUE or BID_VALUE are present, they will get adjusted.
+
+        - If the ``adjust_rule`` parameter is set to SIZE, and the SIZE field is present,
+          it will get adjusted. If the fields ASK_SIZE or BID_SIZE are present, they will get adjusted.
+          If the fields ASK_VALUE or BID_VALUE are present, they will get adjusted.
+
+    adjust_rule: str, optional
+        When set to PRICE, adjustments are applied under the assumption that the fields to be adjusted contain prices
+        (the adjustment direction is determined appropriately).
+
+        When set to SIZE, adjustments are applied under the assumption that the fields contain sizes
+        (the adjustment direction is opposite to that when the parameter's value is PRICE).
+
+        By default the value is PRICE.
+    apply_split: bool, optional
+        If True, adjustments for splits are applied.
+    apply_spinoff: bool, optional
+        If True, adjustments for spin-offs are applied.
+    apply_cash_dividend: bool, optional
+        If True, adjustments for cash dividends are applied.
+    apply_stock_dividend: bool, optional
+        If True, adjustments for stock dividends are applied.
+    apply_security_splice: bool, optional
+        If True, adjustments for security splices are applied.
+    apply_others: str, optional
+        A comma-separated list of names of custom adjustment types to apply.
+    apply_all: bool, optional
+        If True, applies all types of adjustments, both built-in and custom.
+
+    Returns
+    -------
+    :py:class:`onetick.py.Source`
+        A new source object with the adjustments applied.
+
+    See also
+    --------
+    **CORP_ACTIONS** OneTick event processor
+
+    Examples
+    --------
+    >>> src = otp.DataSource('US_COMP',
+    ...                      tick_type='TRD',
+    ...                      start=otp.dt(2022, 5, 20, 9, 30),
+    ...                      end=otp.dt(2022, 5, 26, 16))
+    >>> df = otp.run(src, symbols='MKD', symbol_date=otp.date(2022, 5, 22))
+    >>> df["PRICE"][0]
+    0.0911
+    >>> src = otp.corp_actions(src,
+    ...                        adjustment_date=otp.date(2022, 5, 22),
+    ...                        fields="PRICE")
+    >>> df = otp.run(src, symbols='MKD', symbol_date=otp.date(2022, 5, 22))
+    >>> df["PRICE"][0]
+    1.36649931675
+    """
+    source = source.copy()
+
+    if isinstance(adjustment_date, int):
+        adjustment_date = str(adjustment_date)
+
+    is_datetime_param = None
+
+    if adjustment_date is None or isinstance(adjustment_date, str) and adjustment_date == '':
+        # default value for otq.CorpActions
+        adjustment_date = ''
+    elif isinstance(adjustment_date, (ott.datetime, ott.date, dt.datetime, dt.date, str)):
+        if isinstance(adjustment_date, str):
+            try:
+                dt.datetime.strptime(adjustment_date, '%Y%m%d%H%M%S')
+                if len(adjustment_date) != 14:
+                    # strptime doesn't require leading zeroes for %m%d%H%M%S specifiers, but we do
+                    raise ValueError()
+                is_datetime_param = True
+            except ValueError:
+                try:
+                    dt.datetime.strptime(adjustment_date, '%Y%m%d')
+                    if len(adjustment_date) != 8:
+                        # strptime doesn't require leading zeroes for %m%d specifiers, but we do
+                        raise ValueError()
+                    is_datetime_param = False
+                except ValueError:
+                    raise ValueError("Parameter 'adjustment_date' must be in YYYYMMDDhhmmss or YYYYMMDD formats.")
+            adjustment_date = int(adjustment_date)
+        elif type(adjustment_date) in (ott.datetime, dt.datetime):
+            is_datetime_param = True
+            adjustment_date = int(adjustment_date.strftime('%Y%m%d%H%M%S'))
+        elif type(adjustment_date) in (ott.date, dt.date):
+            is_datetime_param = False
+            adjustment_date = int(adjustment_date.strftime('%Y%m%d'))
+    else:
+        raise ValueError("Parameter 'adjustment_date' must be in YYYYMMDDhhmmss or YYYYMMDD formats.")
+
+    adjustment_date_tz_is_default = adjustment_date_tz is default
+    if adjustment_date_tz_is_default:
+        adjustment_date_tz = config.tz
+
+    if not adjustment_date_tz:
+        warnings.warn("Local timezone can't be used in parameter 'adjustment_date_tz', setting to 'GMT'.")
+        adjustment_date_tz = 'GMT'
+
+    if is_datetime_param is not None and not is_datetime_param and adjustment_date_tz != 'GMT':
+        adjustment_date_tz = 'GMT'
+        if not adjustment_date_tz_is_default:
+            warnings.warn("`adjustment_date_tz` was changed to 'GMT' since "
+                          "it is the only valid value when `adjustment_date` is in YYYYMMDD format.")
+
+    kwargs = {}
+    if apply_rights is not None and is_apply_rights_supported(throw_warning=True):
+        kwargs['apply_rights'] = apply_rights
+
+    source.sink(otq.CorpActions(
+        adjustment_date=adjustment_date,
+        adjustment_date_tz=adjustment_date_tz,
+        fields=fields or '',
+        adjust_rule=adjust_rule,
+        apply_split=apply_split,
+        apply_spinoff=apply_spinoff,
+        apply_cash_dividend=apply_cash_dividend,
+        apply_stock_dividend=apply_stock_dividend,
+        apply_security_splice=apply_security_splice,
+        apply_others=apply_others,
+        apply_all=apply_all,
+        **kwargs,
+    ))
+    return source
+
+
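The double ``strptime`` plus length check above exists because ``strptime`` alone accepts strings with missing leading zeroes; a standalone sketch of that validation (hypothetical helper name, same idea as the code above):

    import datetime as dt

    def parse_adjustment_date(s):
        # strptime accepts e.g. '2022105' for '%Y%m%d', hence the explicit length checks
        for fmt, length in (('%Y%m%d%H%M%S', 14), ('%Y%m%d', 8)):
            try:
                dt.datetime.strptime(s, fmt)
            except ValueError:
                continue
            if len(s) == length:
                return int(s), fmt == '%Y%m%d%H%M%S'
        raise ValueError("must be in YYYYMMDDhhmmss or YYYYMMDD formats")

    print(parse_adjustment_date('20220522'))  # (20220522, False)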
+def save_sources_to_single_file(sources,
+                                file_path=None,
+                                file_suffix='',
+                                start=None,
+                                end=None,
+                                start_time_expression=None,
+                                end_time_expression=None,
+                                timezone=None,
+                                running_query_flag=None):
+    """
+    Save onetick.py.Source objects to a single file.
+
+    Parameters
+    ----------
+    sources: dict or list
+        A dict of names -> sources or a list of sources to merge into a single file.
+        If it's a list, the names will be autogenerated.
+        Each source can be an :class:`otp.Source` object or a dictionary with these allowed parameters::
+
+            {
+                'source': otp.Source,
+                'start': datetime(2022, 1, 1),  # optional
+                'end': datetime(2022, 1, 2),  # optional
+                'symbols': otp.Source or otp.Symbols,  # optional
+            }
+
+    file_path: str, optional
+        Path to the file where all sources will be saved.
+        If not set, the sources are saved to a temporary file and its name is returned.
+    file_suffix: str
+        Only used if ``file_path`` is not set.
+        This suffix will be added to the name of a generated query file.
+    start: datetime, optional
+        start time for the resulting query file
+    end: datetime, optional
+        end time for the resulting query file
+    start_time_expression: str, optional
+        start time expression for the resulting query file
+    end_time_expression: str, optional
+        end time expression for the resulting query file
+    timezone: str, optional
+        timezone for the resulting query file
+    running_query_flag: bool, optional
+        running query flag for the resulting query file
+
+    Returns
+    -------
+    If ``sources`` is a list, returns a list of full query paths (path_to_file::query_name)
+    with autogenerated names corresponding to each source from ``sources``.
+    If ``sources`` is a dict, the path to the query file is returned.
+    """
+    if isinstance(sources, dict):
+        names = sources.keys()
+        sources = sources.values()
+        query_names = None
+    else:
+        names = repeat(None)
+        query_names = []
+    tmp_otq = TmpOtq()
+    for name, source in zip(names, sources):
+        query_start = query_end = query_symbols = query_symbol_date = None
+        if isinstance(source, dict):
+            query_start = source.get('start')
+            query_end = source.get('end')
+            query_symbols = source.get('symbols')
+            query_symbol_date = source.get('symbol_date')
+            source = source['source']
+        query_name = source._store_in_tmp_otq(tmp_otq,
+                                              name=name,
+                                              start=query_start,
+                                              end=query_end,
+                                              symbols=query_symbols,
+                                              symbol_date=query_symbol_date)
+        if query_names is not None:
+            query_names.append(query_name)
+    file_path = tmp_otq.save_to_file(
+        file_path=file_path,
+        file_suffix=file_suffix,
+        start=start,
+        end=end,
+        start_time_expression=start_time_expression,
+        end_time_expression=end_time_expression,
+        timezone=timezone,
+        running_query_flag=running_query_flag,
+    )
+    if query_names is not None:
+        return [f'{file_path}::{query_name}' for query_name in query_names]
+    return file_path
+
+
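A sketch of the list form described above (assumes a configured OneTick session; the suffix is arbitrary, and the actual file path and autogenerated query names will differ):

    import onetick.py as otp
    from onetick.py.functions import save_sources_to_single_file

    paths = save_sources_to_single_file([otp.Tick(A=1), otp.Tick(B=2)],
                                        file_suffix='_demo.otq')
    # e.g. ['/tmp/<generated>_demo.otq::<name_0>', '/tmp/<generated>_demo.otq::<name_1>']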
+class _FormatType(Enum):
+    POSITIONAL = 1
+    OMITTED_POSITIONAL = 2
+    KEY_WORD = 3
+
+
+def format(format_line: str, *args, **kwargs) -> Operation:
+    """
+    Perform a string formatting operation.
+    Currently, there are only 2 types of formatting available:
+
+    1. Float precision - ``{:.xf}``, where ``x`` is a number, e.g. ``{:.5f}``
+
+    2. Time formatting - the same as in ``Source.dt.strftime``
+
+    See the examples for more information.
+
+    Parameters
+    ----------
+    format_line: str
+        String which contains literal text and replacement fields delimited by braces {}.
+        Arbitrary replacement-field content is not supported: only argument references
+        and the two format specifications listed above can appear inside the braces.
+    args
+        Values to paste into the line.
+    kwargs
+        Key-word values to paste into the line.
+
+    Returns
+    -------
+    :py:class:`~onetick.py.Operation` with type equal to :py:class:`~onetick.py.types.varstring`
+
+    Examples
+    --------
+    It allows formatting an :py:class:`~onetick.py.Operation`. For example, a :py:class:`~onetick.py.Column`:
+
+    >>> data = otp.Ticks(A=[1, 2], B=['abc', 'def'])
+    >>> data['C'] = otp.format('A field value is `{}` and B field value is `{}`', data['A'], data['B'])
+    >>> otp.run(data)
+                         Time  A    B                                                 C
+    0 2003-12-01 00:00:00.000  1  abc  A field value is `1` and B field value is `abc`
+    1 2003-12-01 00:00:00.001  2  def  A field value is `2` and B field value is `def`
+
+    Formatting can use positional arguments:
+
+    >>> data = otp.Ticks(A=[1, 2], B=['abc', 'def'])
+    >>> data['C'] = otp.format('A is `{0}`, B is `{1}`. Also, A is `{0}`', data['A'], data['B'])
+    >>> otp.run(data)
+                         Time  A    B                                     C
+    0 2003-12-01 00:00:00.000  1  abc  A is `1`, B is `abc`. Also, A is `1`
+    1 2003-12-01 00:00:00.001  2  def  A is `2`, B is `def`. Also, A is `2`
+
+    Formatting can be used with key-word arguments:
+
+    >>> data = otp.Ticks(A=[1, 2], B=['abc', 'def'])
+    >>> data['C'] = otp.format('A is `{a}`, B is `{b}`. Also, A is `{a}`', a=data['A'], b=data['B'])
+    >>> otp.run(data)
+                         Time  A    B                                     C
+    0 2003-12-01 00:00:00.000  1  abc  A is `1`, B is `abc`. Also, A is `1`
+    1 2003-12-01 00:00:00.001  2  def  A is `2`, B is `def`. Also, A is `2`
+
+    Float numbers can be formatted:
+
+    >>> data = otp.Ticks(A=[12.3456, 67.8971])
+    >>> data['B'] = otp.format('A is about {:.2f}', data['A'])
+    >>> otp.run(data)
+                         Time        A                 B
+    0 2003-12-01 00:00:00.000  12.3456  A is about 12.35
+    1 2003-12-01 00:00:00.001  67.8971  A is about 67.90
+
+    Time can be formatted:
+
+    >>> data = otp.Tick(A=otp.datetime(2020, 4, 5, 17, 56, 3, 789123))
+    >>> data['B'] = otp.format('A is {:%Y/%m/%d %H:%M:%S.%J}', data['A'])
+    >>> otp.run(data)
+            Time                          A                                   B
+    0 2003-12-01 2020-04-05 17:56:03.789123  A is 2020/04/05 17:56:03.789123000
+    """
+    _validate_format_line(format_line)
+    format_array = re.split('[{}]', format_line)
+    format_type = _get_format_type(format_array)
+    res = ott.varstring(format_array[0])
+    cur_index = 0
+    format_spec_array = format_array[1::2]
+    regular_string_array = format_array[2::2]
+    for format_spec, regular_string in zip(format_spec_array, regular_string_array):
+        format_spec_array = format_spec.split(':', 1)
+        format_spec_param = format_spec_array[0]
+        format_spec_additional = None if len(format_spec_array) == 1 else format_spec_array[1]
+        if format_type == _FormatType.POSITIONAL:
+            res = _add_element(res, args[int(format_spec_param)], format_spec_additional)
+        elif format_type == _FormatType.OMITTED_POSITIONAL:
+            res = _add_element(res, args[cur_index], format_spec_additional)
+            cur_index += 1
+        else:
+            res = _add_element(res, kwargs[format_spec_param], format_spec_additional)
+        res += regular_string
+    return res
+
+
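The splitter in ``format`` above decomposes the format line so that odd positions hold the replacement-field specs and even positions the literal fragments; a quick standalone check:

    import re

    parts = re.split('[{}]', 'A is {a}, B is {b:.2f}!')
    print(parts)        # ['A is ', 'a', ', B is ', 'b:.2f', '!']
    print(parts[1::2])  # specs:    ['a', 'b:.2f']
    print(parts[2::2])  # literals: [', B is ', '!']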
+def _add_element(cur_res, element, format_spec_additional=None):
+    if isinstance(element, Operation):
+        if format_spec_additional is None:
+            cur_res += element.apply(str)
+        elif issubclass(element.dtype, (float, ott.decimal)) and re.fullmatch(r'\.\d+f', format_spec_additional):
+            # float has strange behavior when precision=0
+            decimal_elem = element.apply(ott.decimal)
+            precision_str = re.findall(r'\d+', format_spec_additional)[0]
+            try:
+                precision = int(precision_str)
+            except ValueError as exc:
+                raise ValueError('Incorrect value for `precision` for formatting decimal number') from exc
+
+            cur_res += decimal_elem.decimal.str(precision)
+        elif issubclass(element.dtype, (ott.nsectime, ott.msectime)):
+            cur_res += element.dt.strftime(format_spec_additional)
+        else:
+            raise ValueError(f'Unsupported formatting `{format_spec_additional}` for field type {element.dtype}')
+    else:
+        if format_spec_additional is None:
+            cur_res += str(element)
+        elif isinstance(element, (float, ott.decimal)):
+            formatting = f'{{:{format_spec_additional}}}'
+            cur_res += formatting.format(element)
+        else:
+            raise ValueError(f'Unsupported formatting `{format_spec_additional}` for literal {type(element)}')
+    return cur_res
+
+
+def _validate_format_line(format_line: str):
+    open_brackets_num = 0
+    close_brackets_num = 0
+    for symbol in format_line:
+        if symbol == '{':
+            open_brackets_num += 1
+        if symbol == '}':
+            close_brackets_num += 1
+        if open_brackets_num > close_brackets_num + 1:
+            raise ValueError("'{' appeared before previous '{' was closed")
+        if open_brackets_num < close_brackets_num:
+            raise ValueError("Single '}' encountered in format string")
+    if open_brackets_num != close_brackets_num:
+        raise ValueError("Single '{' encountered in format string")
+
+
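Behavior of the brace validator above on a few inputs (a sketch assuming the private helper is importable, e.g. ``from onetick.py.functions import _validate_format_line``):

    for line in ('{ok}', '{un{balanced}', 'closed}'):
        try:
            _validate_format_line(line)
            print(repr(line), '-> ok')
        except ValueError as e:
            print(repr(line), '->', e)
    # '{ok}'          -> ok
    # '{un{balanced}' -> '{' appeared before previous '{' was closed
    # 'closed}'       -> Single '}' encountered in format string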
+def _get_format_type(format_array: List[str]) -> _FormatType:
+    if len(format_array) < 2:
+        return _FormatType.OMITTED_POSITIONAL
+    format_spec_array = format_array[1::2]
+    uses_positional = False
+    uses_omitted_positional = False
+    uses_key_word = False
+    for format_spec in format_spec_array:
+        format_spec_param = format_spec.split(':')[0]
+        if not format_spec_param:
+            uses_omitted_positional = True
+        elif format_spec_param[0].isdigit():
+            if not format_spec_param.isnumeric():
+                raise ValueError(f'Incorrect positional argument: `{format_spec_param}`')
+            uses_positional = True
+        elif format_spec_param[0].isalpha():
+            # only word characters are supported
+            if not re.fullmatch(r'\w+', format_spec_param):
+                raise ValueError(f'Incorrect key word argument: `{format_spec_param}`')
+            uses_key_word = True
+        else:
+            raise ValueError(f'Unrecognised format specification: `{format_spec_param}`')
+    if uses_positional and not (uses_omitted_positional or uses_key_word):
+        return _FormatType.POSITIONAL
+    if uses_omitted_positional and not (uses_positional or uses_key_word):
+        return _FormatType.OMITTED_POSITIONAL
+    if uses_key_word and not (uses_positional or uses_omitted_positional):
+        return _FormatType.KEY_WORD
+    raise ValueError("Format string has mixed types of argument references, which is not allowed")
+
+
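And a quick check of the classifier above on the three supported argument styles plus a mixed one (same importability assumption as before):

    import re

    for line in ('{} {}', '{0} {1}', '{a} {b}', '{} {0}'):
        parts = re.split('[{}]', line)
        try:
            print(line, '->', _get_format_type(parts).name)
        except ValueError as e:
            print(line, '->', e)
    # {} {}   -> OMITTED_POSITIONAL
    # {0} {1} -> POSITIONAL
    # {a} {b} -> KEY_WORD
    # {} {0}  -> Format string has mixed types of argument references, ...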
2106
|
+
def join_with_aggregated_window(
|
|
2107
|
+
agg_src, pass_src, aggregation,
|
|
2108
|
+
boundary_aggr_tick: str = 'next',
|
|
2109
|
+
pass_src_delay_msec: int = 0,
|
|
2110
|
+
bucket_interval: int = 0,
|
|
2111
|
+
bucket_units: Literal['seconds', 'ticks', 'days', 'months', 'flexible'] = 'seconds',
|
|
2112
|
+
output_type_index=None,
|
|
2113
|
+
):
|
|
2114
|
+
"""
|
|
2115
|
+
Computes one or more aggregations on ``agg_src`` time series
|
|
2116
|
+
and joins the result with each incoming tick from ``pass_src`` time series.
|
|
2117
|
+
|
|
2118
|
+
Parameters
|
|
2119
|
+
----------
|
|
2120
|
+
agg_src: :py:class:`onetick.py.Source`
|
|
2121
|
+
Input time series to which aggregation will be applied.
|
|
2122
|
+
pass_src: :py:class:`onetick.py.Source`
|
|
2123
|
+
Input time series that will be joined with the aggregation result.
|
|
2124
|
+
aggregation: dict
|
|
2125
|
+
Dictionary with aggregation output field names and aggregation objects,
|
|
2126
|
+
similar to the one passed to :py:meth:`onetick.py.Source.agg` method.
|
|
2127
|
+
pass_src_delay_msec: int
|
|
2128
|
+
Specifies by how much any incoming tick from the ``pass_src`` is delayed.
|
|
2129
|
+
|
|
2130
|
+
The effective timestamp of a tick from the ``pass_src`` with timestamp ``T`` is ``T - pass_src_delay_msec``.
|
|
2131
|
+
This parameter may be negative, in which case ticks from ``pass_src`` will be joined
|
|
2132
|
+
with the aggregation result of a later timestamp.
|
|
2133
|
+
boundary_aggr_tick: str
|
|
2134
|
+
Controls the logic of joining ticks with the same timestamp.
|
|
2135
|
+
|
|
2136
|
+
If set to **next**, ticks from ``agg_src`` with the same timestamp (+ ``pass_src_delay_msec``)
|
|
2137
|
+
as the latest ticks from ``pass_src`` will not be included in that tick's joined aggregation.
|
|
2138
|
+
bucket_interval: int
|
|
2139
|
+
Determines the length of each bucket (units depends on ``bucket_units``).
|
|
2140
|
+
|
|
2141
|
+
When this parameter is set to 0 (by default),
|
|
2142
|
+
the computation of the aggregation is performed for all ticks starting from the query's start time
|
|
2143
|
+
and until ``pass_src`` effective tick timestamp ``T - pass_src_delay_timestamp``,
|
|
2144
|
+
regardless of the value of ``bucket_units``.
|
|
2145
|
+
bucket_units: 'seconds', 'ticks', 'days', 'months'
|
|
2146
|
+
Set bucket interval units.
|
|
2147
|
+
output_type_index: int
|
|
2148
|
+
Specifies index of source between ``agg_src`` and ``pass_src``
|
|
2149
|
+
from which type and properties of output object will be taken.
|
|
2150
|
+
Useful when merging sources that inherited from :class:`Source`.
|
|
2151
|
+
By default, output object type will be :class:`Source`.
|
|
2152
|
+
|
|
2153
|
+
Returns
|
|
2154
|
+
-------
|
|
2155
|
+
:py:class:`onetick.py.Source`
|
|
2156
|
+
|
|
2157
|
+
See also
|
|
2158
|
+
--------
|
|
2159
|
+
**JOIN_WITH_AGGREGATED_WINDOW** OneTick event processor
|
|
2160
|
+
|
|
2161
|
+
Examples
|
|
2162
|
+
--------
|
|
2163
|
+
|
|
2164
|
+
>>> agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
|
|
2165
|
+
>>> pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
|
|
2166
|
+
>>> otp.run(agg_src)
|
|
2167
|
+
Time A
|
|
2168
|
+
0 2003-12-01 00:00:00.000 0
|
|
2169
|
+
1 2003-12-01 00:00:00.001 1
|
|
2170
|
+
2 2003-12-01 00:00:00.002 2
|
|
2171
|
+
3 2003-12-01 00:00:00.003 3
|
|
2172
|
+
4 2003-12-01 00:00:00.004 4
|
|
2173
|
+
5 2003-12-01 00:00:00.005 5
|
|
2174
|
+
6 2003-12-01 00:00:00.006 6
|
|
2175
|
+
>>> otp.run(pass_src)
|
|
2176
|
+
Time B
|
|
2177
|
+
0 2003-12-01 00:00:00.001 1
|
|
2178
|
+
1 2003-12-01 00:00:00.003 3
|
|
2179
|
+
2 2003-12-01 00:00:00.005 5
|
|
2180
|
+
|
|
2181
|
+
By default the aggregation is applied to the ticks from ``agg_src`` in the bucket
|
|
2182
|
+
from query start time until (but not including) the *effective* timestamp of the tick from ``pass_src``:
|
|
2183
|
+
|
|
2184
|
+
.. testcode::
|
|
2185
|
+
:skipif: not is_supported_join_with_aggregated_window()
|
|
2186
|
+
|
|
2187
|
+
agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
|
|
2188
|
+
pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
|
|
2189
|
+
data = otp.join_with_aggregated_window(
|
|
2190
|
+
agg_src, pass_src, {
|
|
2191
|
+
'SUM': otp.agg.sum('A'),
|
|
2192
|
+
'COUNT': otp.agg.count(),
|
|
2193
|
+
}
|
|
2194
|
+
)
|
|
2195
|
+
df = otp.run(data)
|
|
2196
|
+
print(df)
|
|
2197
|
+
|
|
2198
|
+
.. testoutput::
|
|
2199
|
+
|
|
2200
|
+
Time SUM COUNT B
|
|
2201
|
+
0 2003-12-01 00:00:00.001 0 1 1
|
|
2202
|
+
1 2003-12-01 00:00:00.003 3 3 3
|
|
2203
|
+
2 2003-12-01 00:00:00.005 10 5 5
|
|
2204
|
+
|
|
2205
|
+
If you want ticks from ``agg_src`` with timestamp equal to *effective* timestamp of tick from ``pass_src``
|
|
2206
|
+
to be included in bucket, you can set ``boundary_aggr_tick`` to ``previous``:
|
|
2207
|
+
|
|
2208
|
+
.. testcode::
|
|
2209
|
+
:skipif: not is_supported_join_with_aggregated_window()
|
|
2210
|
+
|
|
2211
|
+
agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
|
|
2212
|
+
pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
|
|
2213
|
+
data = otp.join_with_aggregated_window(
|
|
2214
|
+
agg_src, pass_src, {
|
|
2215
|
+
'SUM': otp.agg.sum('A'),
|
|
2216
|
+
'COUNT': otp.agg.count(),
|
|
2217
|
+
},
|
|
2218
|
+
boundary_aggr_tick='previous',
|
|
2219
|
+
)
|
|
2220
|
+
df = otp.run(data)
|
|
2221
|
+
print(df)
|
|
2222
|
+
|
|
2223
|
+
.. testoutput::
|
|
2224
|
+
|
|
2225
|
+
Time SUM COUNT B
|
|
2226
|
+
0 2003-12-01 00:00:00.001 1 2 1
|
|
2227
|
+
1 2003-12-01 00:00:00.003 6 4 3
|
|
2228
|
+
2 2003-12-01 00:00:00.005 15 6 5
|
|
2229
|
+
|
|
2230
|
+
Set parameters ``bucket_interval`` and ``bucket_units`` to control the size of the aggregation bucket.
|
|
2231
|
+
For example, to aggregate buckets of two ticks:
|
|
2232
|
+
|
|
2233
|
+
.. testcode::
|
|
2234
|
+
:skipif: not is_supported_join_with_aggregated_window()
|
|
2235
|
+
|
|
2236
|
+
agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
|
|
2237
|
+
pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
|
|
2238
|
+
data = otp.join_with_aggregated_window(
|
|
2239
|
+
agg_src, pass_src, {
|
|
2240
|
+
'SUM': otp.agg.sum('A'),
|
|
2241
|
+
'COUNT': otp.agg.count(),
|
|
2242
|
+
},
|
|
2243
|
+
boundary_aggr_tick='previous',
|
|
2244
|
+
bucket_interval=2,
|
|
2245
|
+
bucket_units='ticks',
|
|
2246
|
+
)
|
|
2247
|
+
df = otp.run(data)
|
|
2248
|
+
print(df)
|
|
2249
|
+
|
|
2250
|
+
.. testoutput::
|
|
2251
|
+
|
|
2252
|
+
Time SUM COUNT B
|
|
2253
|
+
0 2003-12-01 00:00:00.001 1 2 1
|
|
2254
|
+
1 2003-12-01 00:00:00.003 5 2 3
|
|
2255
|
+
2 2003-12-01 00:00:00.005 9 2 5
|
|
2256
|
+
|
|
2257
|
+
By default the *effective* timestamp of the tick from ``pass_src`` is the same as original.
|
|
2258
|
+
It can be changed with parameter ``pass_src_delay_msec``.
|
|
2259
|
+
The *effective* timestamp of the tick is calculated with ``T - pass_src_delay_msec``,
|
|
2260
|
+
and parameter ``pass_src_delay_msec`` can be negative too.
|
|
2261
|
+
This allows to shift bucket end boundary like this:
|
|
2262
|
+
|
|
2263
|
+
.. testcode::
|
|
2264
|
+
:skipif: not is_supported_join_with_aggregated_window()
|
|
2265
|
+
|
|
2266
|
+
agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
|
|
2267
|
+
pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
|
|
2268
|
+
data = otp.join_with_aggregated_window(
|
|
2269
|
+
agg_src, pass_src, {
|
|
2270
|
+
'SUM': otp.agg.sum('A'),
|
|
2271
|
+
'COUNT': otp.agg.count(),
|
|
2272
|
+
},
|
|
2273
|
+
boundary_aggr_tick='previous',
|
|
2274
|
+
pass_src_delay_msec=-1,
|
|
2275
|
+
)
|
|
2276
|
+
df = otp.run(data)
|
|
2277
|
+
print(df)
|
|
2278
|
+
|
|
2279
|
+
.. testoutput::
|
|
2280
|
+
|
|
2281
|
+
Time SUM COUNT B
|
|
2282
|
+
0 2003-12-01 00:00:00.001 3 3 1
|
|
2283
|
+
1 2003-12-01 00:00:00.003 10 5 3
|
|
2284
|
+
2 2003-12-01 00:00:00.005 21 7 5
|
|
2285
|
+
Use the ``output_type_index`` parameter to specify which input class to use to create the output object.
This may be useful when a custom user class was used as input:

.. testcode::
    :skipif: not is_supported_join_with_aggregated_window()

    class CustomTick(otp.Tick):
        def custom_method(self):
            return 'custom_result'

    data1 = otp.Tick(A=1)
    data2 = CustomTick(B=2)
    data = otp.join_with_aggregated_window(
        data1, data2, {'A': otp.agg.count()},
        boundary_aggr_tick='previous',
        output_type_index=1,
    )
    print(type(data))
    print(repr(data.custom_method()))
    print(otp.run(data))

.. testoutput::

    <class 'onetick.py.functions.CustomTick'>
    'custom_result'
            Time  A  B
    0 2003-12-01  1  2

Use-case: check the volume in the 60 seconds following each trade (not including the trade itself):

>>> data = otp.DataSource('US_COMP', tick_type='TRD', symbols='MSFT', date=otp.dt(2022, 3, 3))
>>> otp.run(data)
                     Time  PRICE  SIZE
0 2022-03-03 00:00:00.000    1.0   100
1 2022-03-03 00:00:00.001    1.1   101
2 2022-03-03 00:00:00.002    1.2   102
3 2022-03-03 00:01:00.000    2.0   200
4 2022-03-03 00:01:00.001    2.1   201
5 2022-03-03 00:01:00.002    2.2   202

.. testcode::
    :skipif: not is_supported_join_with_aggregated_window()

    data = otp.DataSource('US_COMP', tick_type='TRD', symbols='MSFT', date=otp.dt(2022, 3, 3))
    data = otp.join_with_aggregated_window(
        data, data, {'VOLUME': otp.agg.sum('SIZE')},
        boundary_aggr_tick='next',
        pass_src_delay_msec=-60000,
        bucket_interval=60, bucket_units='seconds',
    )
    df = otp.run(data)
    print(df)

.. testoutput::

                         Time  VOLUME  PRICE  SIZE
    0 2022-03-03 00:00:00.000     203    1.0   100
    1 2022-03-03 00:00:00.001     302    1.1   101
    2 2022-03-03 00:00:00.002     401    1.2   102
    3 2022-03-03 00:01:00.000     403    2.0   200
    4 2022-03-03 00:01:00.001     202    2.1   201
    5 2022-03-03 00:01:00.002       0    2.2   202

For example, the first trade's VOLUME sums the trades that fall strictly within the 60 seconds
after it: 101 + 102 = 203 (the trade itself and the trade exactly 60 seconds later fall on the
window boundaries and are excluded).
"""
if not is_supported_join_with_aggregated_window():
    raise RuntimeError('Function join_with_aggregated_window() is not supported on this OneTick build')

if boundary_aggr_tick not in {'next', 'previous'}:
    raise ValueError(f"Wrong value of 'boundary_aggr_tick' parameter: '{boundary_aggr_tick}'")
if boundary_aggr_tick == 'next':
    boundary_aggr_tick_behavior = 'NEXT_WINDOW'
    is_supported_next_in_join_with_aggregated_window(
        throw_warning=True,
        feature_name="setting parameter 'boundary_aggr_tick' to 'next' (as this may result in crash)"
    )
else:
    boundary_aggr_tick_behavior = 'PREV_WINDOW'

aggregation_str = ','.join([
    str(aggr) + " " + name
    for name, aggr in aggregation.items()
])
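# For illustration (hypothetical rendering): {'SUM': otp.agg.sum('A'), 'COUNT': otp.agg.count()}
# would produce a specification like "SUM(A) SUM,COUNT(*) COUNT"; the exact text is whatever
# each aggregation's string representation renders to.
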
params = dict(
    aggregation_source='__AGG_SRC__',
    pass_source='__PASS_SRC__',
    boundary_aggr_tick_behavior=boundary_aggr_tick_behavior,
    append_output_field_name=False,
    aggregation=aggregation_str,
    pass_source_delay_msec=pass_src_delay_msec,
    bucket_interval=bucket_interval,
    bucket_interval_units=bucket_units.upper(),
)
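# Note the argument-name translation to the event processor's parameter names:
# pass_src_delay_msec -> pass_source_delay_msec,
# bucket_units -> bucket_interval_units (upper-cased).
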
output_type = output_type_by_index((agg_src, pass_src), output_type_index)

agg_src = agg_src.copy()
pass_src = pass_src.copy()

# rename the copied input nodes so the EP parameters above can reference them
agg_src.node_name('__AGG_SRC__')
pass_src.node_name('__PASS_SRC__')

# output schema: aggregation output fields plus everything passed through from pass_src
columns = {}
for name, aggr in aggregation.items():
    columns.update(aggr._get_output_schema(agg_src, name=name))
columns.update(pass_src.schema)
result = output_type(node=otq.JoinWithAggregatedWindow(**params), schema=columns)

__copy_sources_on_merge_or_join(result, (agg_src, pass_src),
                                names=('__AGG_SRC__', '__PASS_SRC__'),
                                output_type_index=output_type_index)

# adding table to convert types in schema, e.g. float to int
result._add_table(strict=False)
return result