onetick-py 1.177.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. locator_parser/__init__.py +0 -0
  2. locator_parser/acl.py +73 -0
  3. locator_parser/actions.py +262 -0
  4. locator_parser/common.py +368 -0
  5. locator_parser/io.py +43 -0
  6. locator_parser/locator.py +150 -0
  7. onetick/__init__.py +101 -0
  8. onetick/doc_utilities/__init__.py +3 -0
  9. onetick/doc_utilities/napoleon.py +40 -0
  10. onetick/doc_utilities/ot_doctest.py +140 -0
  11. onetick/doc_utilities/snippets.py +279 -0
  12. onetick/lib/__init__.py +4 -0
  13. onetick/lib/instance.py +141 -0
  14. onetick/py/__init__.py +293 -0
  15. onetick/py/_stack_info.py +89 -0
  16. onetick/py/_version.py +2 -0
  17. onetick/py/aggregations/__init__.py +11 -0
  18. onetick/py/aggregations/_base.py +648 -0
  19. onetick/py/aggregations/_docs.py +948 -0
  20. onetick/py/aggregations/compute.py +286 -0
  21. onetick/py/aggregations/functions.py +2216 -0
  22. onetick/py/aggregations/generic.py +104 -0
  23. onetick/py/aggregations/high_low.py +80 -0
  24. onetick/py/aggregations/num_distinct.py +83 -0
  25. onetick/py/aggregations/order_book.py +501 -0
  26. onetick/py/aggregations/other.py +1014 -0
  27. onetick/py/backports.py +26 -0
  28. onetick/py/cache.py +374 -0
  29. onetick/py/callback/__init__.py +5 -0
  30. onetick/py/callback/callback.py +276 -0
  31. onetick/py/callback/callbacks.py +131 -0
  32. onetick/py/compatibility.py +798 -0
  33. onetick/py/configuration.py +771 -0
  34. onetick/py/core/__init__.py +0 -0
  35. onetick/py/core/_csv_inspector.py +93 -0
  36. onetick/py/core/_internal/__init__.py +0 -0
  37. onetick/py/core/_internal/_manually_bound_value.py +6 -0
  38. onetick/py/core/_internal/_nodes_history.py +250 -0
  39. onetick/py/core/_internal/_op_utils/__init__.py +0 -0
  40. onetick/py/core/_internal/_op_utils/every_operand.py +9 -0
  41. onetick/py/core/_internal/_op_utils/is_const.py +10 -0
  42. onetick/py/core/_internal/_per_tick_scripts/tick_list_sort_template.script +121 -0
  43. onetick/py/core/_internal/_proxy_node.py +140 -0
  44. onetick/py/core/_internal/_state_objects.py +2312 -0
  45. onetick/py/core/_internal/_state_vars.py +93 -0
  46. onetick/py/core/_source/__init__.py +0 -0
  47. onetick/py/core/_source/_symbol_param.py +95 -0
  48. onetick/py/core/_source/schema.py +97 -0
  49. onetick/py/core/_source/source_methods/__init__.py +0 -0
  50. onetick/py/core/_source/source_methods/aggregations.py +809 -0
  51. onetick/py/core/_source/source_methods/applyers.py +296 -0
  52. onetick/py/core/_source/source_methods/columns.py +141 -0
  53. onetick/py/core/_source/source_methods/data_quality.py +301 -0
  54. onetick/py/core/_source/source_methods/debugs.py +272 -0
  55. onetick/py/core/_source/source_methods/drops.py +120 -0
  56. onetick/py/core/_source/source_methods/fields.py +619 -0
  57. onetick/py/core/_source/source_methods/filters.py +1002 -0
  58. onetick/py/core/_source/source_methods/joins.py +1413 -0
  59. onetick/py/core/_source/source_methods/merges.py +605 -0
  60. onetick/py/core/_source/source_methods/misc.py +1455 -0
  61. onetick/py/core/_source/source_methods/pandases.py +155 -0
  62. onetick/py/core/_source/source_methods/renames.py +356 -0
  63. onetick/py/core/_source/source_methods/sorts.py +183 -0
  64. onetick/py/core/_source/source_methods/switches.py +142 -0
  65. onetick/py/core/_source/source_methods/symbols.py +117 -0
  66. onetick/py/core/_source/source_methods/times.py +627 -0
  67. onetick/py/core/_source/source_methods/writes.py +986 -0
  68. onetick/py/core/_source/symbol.py +205 -0
  69. onetick/py/core/_source/tmp_otq.py +222 -0
  70. onetick/py/core/column.py +209 -0
  71. onetick/py/core/column_operations/__init__.py +0 -0
  72. onetick/py/core/column_operations/_methods/__init__.py +4 -0
  73. onetick/py/core/column_operations/_methods/_internal.py +28 -0
  74. onetick/py/core/column_operations/_methods/conversions.py +216 -0
  75. onetick/py/core/column_operations/_methods/methods.py +292 -0
  76. onetick/py/core/column_operations/_methods/op_types.py +160 -0
  77. onetick/py/core/column_operations/accessors/__init__.py +0 -0
  78. onetick/py/core/column_operations/accessors/_accessor.py +28 -0
  79. onetick/py/core/column_operations/accessors/decimal_accessor.py +104 -0
  80. onetick/py/core/column_operations/accessors/dt_accessor.py +537 -0
  81. onetick/py/core/column_operations/accessors/float_accessor.py +184 -0
  82. onetick/py/core/column_operations/accessors/str_accessor.py +1367 -0
  83. onetick/py/core/column_operations/base.py +1121 -0
  84. onetick/py/core/cut_builder.py +150 -0
  85. onetick/py/core/db_constants.py +20 -0
  86. onetick/py/core/eval_query.py +245 -0
  87. onetick/py/core/lambda_object.py +441 -0
  88. onetick/py/core/multi_output_source.py +232 -0
  89. onetick/py/core/per_tick_script.py +2256 -0
  90. onetick/py/core/query_inspector.py +464 -0
  91. onetick/py/core/source.py +1744 -0
  92. onetick/py/db/__init__.py +2 -0
  93. onetick/py/db/_inspection.py +1128 -0
  94. onetick/py/db/db.py +1327 -0
  95. onetick/py/db/utils.py +64 -0
  96. onetick/py/docs/__init__.py +0 -0
  97. onetick/py/docs/docstring_parser.py +112 -0
  98. onetick/py/docs/utils.py +81 -0
  99. onetick/py/functions.py +2398 -0
  100. onetick/py/license.py +190 -0
  101. onetick/py/log.py +88 -0
  102. onetick/py/math.py +935 -0
  103. onetick/py/misc.py +470 -0
  104. onetick/py/oqd/__init__.py +22 -0
  105. onetick/py/oqd/eps.py +1195 -0
  106. onetick/py/oqd/sources.py +325 -0
  107. onetick/py/otq.py +216 -0
  108. onetick/py/pyomd_mock.py +47 -0
  109. onetick/py/run.py +916 -0
  110. onetick/py/servers.py +173 -0
  111. onetick/py/session.py +1347 -0
  112. onetick/py/sources/__init__.py +19 -0
  113. onetick/py/sources/cache.py +167 -0
  114. onetick/py/sources/common.py +128 -0
  115. onetick/py/sources/csv.py +642 -0
  116. onetick/py/sources/custom.py +85 -0
  117. onetick/py/sources/data_file.py +305 -0
  118. onetick/py/sources/data_source.py +1045 -0
  119. onetick/py/sources/empty.py +94 -0
  120. onetick/py/sources/odbc.py +337 -0
  121. onetick/py/sources/order_book.py +271 -0
  122. onetick/py/sources/parquet.py +168 -0
  123. onetick/py/sources/pit.py +191 -0
  124. onetick/py/sources/query.py +495 -0
  125. onetick/py/sources/snapshots.py +419 -0
  126. onetick/py/sources/split_query_output_by_symbol.py +198 -0
  127. onetick/py/sources/symbology_mapping.py +123 -0
  128. onetick/py/sources/symbols.py +374 -0
  129. onetick/py/sources/ticks.py +825 -0
  130. onetick/py/sql.py +70 -0
  131. onetick/py/state.py +251 -0
  132. onetick/py/types.py +2131 -0
  133. onetick/py/utils/__init__.py +70 -0
  134. onetick/py/utils/acl.py +93 -0
  135. onetick/py/utils/config.py +186 -0
  136. onetick/py/utils/default.py +49 -0
  137. onetick/py/utils/file.py +38 -0
  138. onetick/py/utils/helpers.py +76 -0
  139. onetick/py/utils/locator.py +94 -0
  140. onetick/py/utils/perf.py +498 -0
  141. onetick/py/utils/query.py +49 -0
  142. onetick/py/utils/render.py +1374 -0
  143. onetick/py/utils/script.py +244 -0
  144. onetick/py/utils/temp.py +471 -0
  145. onetick/py/utils/types.py +120 -0
  146. onetick/py/utils/tz.py +84 -0
  147. onetick_py-1.177.0.dist-info/METADATA +137 -0
  148. onetick_py-1.177.0.dist-info/RECORD +152 -0
  149. onetick_py-1.177.0.dist-info/WHEEL +5 -0
  150. onetick_py-1.177.0.dist-info/entry_points.txt +2 -0
  151. onetick_py-1.177.0.dist-info/licenses/LICENSE +21 -0
  152. onetick_py-1.177.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,2398 @@
+ import itertools
+ import warnings
+ import inspect
+ import re
+ import datetime as dt
+ from collections import defaultdict, Counter
+ from functools import singledispatch
+ from itertools import chain, zip_longest, repeat
+ from typing import List, Union, Type, Optional, Sequence
+ from enum import Enum
+
+ from onetick.py.otq import otq
+
+ from onetick.py.configuration import config, default_presort_concurrency
+ from onetick.py.core.eval_query import _QueryEvalWrapper
+ from onetick.py.core._source._symbol_param import _SymbolParamSource
+ from onetick.py.core._source.tmp_otq import TmpOtq
+ from onetick.py.utils import get_type_that_includes, adaptive, default
+ import onetick.py.types as ott
+ from onetick.py.core.column import Column
+ from onetick.py.core.column_operations.base import Operation
+ from onetick.py.core.cut_builder import _QCutBuilder, _CutBuilder
+ from onetick.py.backports import Literal
+ from onetick.py.compatibility import (
+     is_supported_join_with_aggregated_window,
+     is_supported_next_in_join_with_aggregated_window,
+     is_apply_rights_supported,
+ )
+
+
+ __all__ = ['merge', 'join', 'join_by_time', 'apply_query', 'apply', 'cut', 'qcut', 'coalesce', 'corp_actions', 'format']
+
+
+ def output_type_by_index(sources, index):
+     if index is None:
+         from onetick.py.core.source import _Source
+         return _Source
+     return type(sources[index])
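+
+ # Usage sketch (hypothetical inputs): output_type_by_index([ticks, custom_src], 1)
+ # returns type(custom_src), so merge()/join() can build their result with the same
+ # class as a chosen input; index=None falls back to the generic _Source class.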
+
+
+ def apply_symbol_to_ep(base_ep, symbol, tmp_otq, symbol_date=None):
+     if not symbol:
+         return base_ep
+
+     from onetick.py.core.source import _Source
+     from onetick.py.sources import query as otp_query
+
+     if isinstance(symbol, _QueryEvalWrapper):
+         symbol = symbol.to_eval_string(tmp_otq=tmp_otq, symbol_date=symbol_date)
+     elif isinstance(symbol, otp_query):
+         if symbol_date is not None:
+             raise ValueError("Parameter 'symbol_date' is not supported if symbols are set with otp.query object")
+         symbol = symbol.to_eval_string()
+     elif isinstance(symbol, (_Source, otq.GraphQuery)):
+         symbol = _Source._convert_symbol_to_string(symbol, tmp_otq=tmp_otq, symbol_date=symbol_date)
+
+     return base_ep.symbols(symbol)
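+
+ # Minimal sketch (hypothetical arguments): apply_symbol_to_ep(otq.Merge(),
+ # otp.Ticks(SYMBOL_NAME=['A', 'B']), TmpOtq()) converts the symbol source into an
+ # eval string and binds it to the EP node via base_ep.symbols(...).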
+
+
+ def merge(sources, align_schema=True, symbols=None, identify_input_ts=False,
+           presort=adaptive, concurrency=default, batch_size=default, output_type_index=None,
+           add_symbol_index: bool = False, separate_db_name: bool = False,
+           added_field_name_suffix: str = '', stabilize_schema: Union[Type[adaptive], bool] = adaptive,
+           enforce_order: bool = False, symbol_date=None):
+     """
+     Merges ticks from the ``sources`` into a single output ordered by the timestamp.
+
+     Note
+     ----
+     If merged ticks have the same timestamp, their order is not guaranteed by default.
+     Set the ``enforce_order`` parameter to order them according to the ``sources`` parameter.
+
+     Parameters
+     ----------
+     sources : list
+         List of sources to merge
+     align_schema : bool
+         If set to True, then a table is added right after the merge.
+         We recommend keeping it True to prevent problems with
+         different tick schemas. Default: True
+     symbols: str, list of str or functions, :class:`Source`, :py:class:`onetick.query.GraphQuery`
+         Symbol(s) to run the query for, passed as a string, a list of strings, or as a "symbols" query whose results
+         include the ``SYMBOL_NAME`` column. The start/end times for the
+         symbols query will be taken from the :meth:`run` params.
+         See :ref:`symbols <static/concepts/symbols:Symbols: bound and unbound>` for more details.
+     identify_input_ts: bool
+         If set to False, the fields *SYMBOL_NAME* and *TICK_TYPE* are not appended to the output ticks.
+     presort: bool
+         Add the **PRESORT** EP before merging.
+         By default, it is set to True if ``symbols`` are set
+         and to False otherwise.
+     concurrency: int
+         Specifies the number of CPU cores to utilize for the ``presort``.
+         By default, the value is inherited from the value of the query where this PRESORT is used.
+
+         For the main query it may be specified in the ``concurrency`` parameter of the :meth:`run` method
+         (which by default is set to
+         :py:attr:`otp.config.default_concurrency<onetick.py.configuration.Config.default_concurrency>`).
+
+         For the auxiliary queries (like first-stage queries) an empty value means OneTick's default of 1.
+         If :py:attr:`otp.config.presort_force_default_concurrency<onetick.py.configuration.Config.presort_force_default_concurrency>`
+         is set, then the default concurrency value will be set in all PRESORT EPs in all queries.
+     batch_size: int
+         Specifies the query batch size for the ``presort``.
+         By default, the value from
+         :py:attr:`otp.config.default_batch_size<onetick.py.configuration.Config.default_batch_size>`
+         is used.
+     output_type_index: int
+         Specifies the index of the source in ``sources`` from which the type and properties of the output will be taken.
+         Useful when merging sources that inherited from :class:`Source`.
+         By default, the output object type will be :class:`Source`.
+     add_symbol_index: bool
+         If set to True, this function adds a field *SYMBOL_INDEX* to each tick,
+         with a numeric index (1-based) corresponding to the symbol the tick is for.
+     separate_db_name: bool
+         If set to True, the security name of the input time series is separated into
+         the pure symbol name and the database name parts
+         propagated in the *SYMBOL_NAME* and *DB_NAME* fields, respectively.
+         Otherwise, the full symbol name is propagated in a single field called *SYMBOL_NAME*.
+     added_field_name_suffix: str
+         The suffix to add to the names of additional fields
+         (that is, *SYMBOL_NAME*, *TICK_TYPE*, *DB_NAME* and *SYMBOL_INDEX*).
+     stabilize_schema: bool
+         If set to True, any fields that were present on any tick in the input time series
+         will be present in the ticks of the output time series.
+         New fields will be added to the output tick at the point they are first seen in the input time series.
+         If any field already present in the input is not present on a given input tick,
+         its type will be determined by the widest encountered type under that field name.
+         Incompatible types (for example, int and float) under the same field name will result in an exception.
+
+         Default is False.
+     enforce_order: bool
+         If merged ticks have the same timestamp, their order is not guaranteed by default.
+         Set this parameter to True to set the order according to the ``sources`` parameter.
+
+         The special OneTick field *OMDSEQ* will be used to order sources.
+         If it exists, then it will be overwritten and deleted.
+     symbol_date: :py:class:`otp.datetime <onetick.py.datetime>` or :py:class:`datetime.datetime` or int
+         Symbol date or an integer in the YYYYMMDD format.
+         Can only be specified if the ``symbols`` parameter is set.
+
+     Returns
+     -------
+     :class:`Source` or same class as ``sources[output_type_index]``
+         A time series of ticks.
+
+     See also
+     --------
+     **MERGE** and **PRESORT** OneTick event processors
+
+     Examples
+     --------
+
+     ``merge`` is used to merge different data sources:
+
+     >>> data1 = otp.Ticks(X=[1, 2], Y=['a', 'd'])
+     >>> data2 = otp.Ticks(X=[-1, -2], Y=['*', '-'])
+     >>> data = otp.merge([data1, data2])  # OTdirective: snippet-name:merge.as list;
+     >>> otp.run(data)
+                          Time  X  Y
+     0 2003-12-01 00:00:00.000  1  a
+     1 2003-12-01 00:00:00.000 -1  *
+     2 2003-12-01 00:00:00.001  2  d
+     3 2003-12-01 00:00:00.001 -2  -
+
+     Merge series from multiple symbols into one series:
+
+     >>> # OTdirective: snippet-name:merge.bound symbols;
+     >>> data = otp.Ticks(X=[1])
+     >>> data['SYMBOL_NAME'] = data['_SYMBOL_NAME']
+     >>> symbols = otp.Ticks(SYMBOL_NAME=['A', 'B'])
+     >>> data = otp.merge([data], symbols=symbols)
+     >>> otp.run(data)
+             Time  X SYMBOL_NAME
+     0 2003-12-01  1           A
+     1 2003-12-01  1           B
+
+     Use ``identify_input_ts`` and other parameters to add information about the symbol to each tick:
+
+     >>> symbols = otp.Ticks(SYMBOL_NAME=['COMMON::S1', 'DEMO_L1::S2'])
+     >>> data = otp.Tick(A=1, db=None, tick_type='TT')
+     >>> data = otp.merge([data], symbols=symbols, identify_input_ts=True,
+     ...                  separate_db_name=True, add_symbol_index=True, added_field_name_suffix='__')
+     >>> otp.run(data)
+             Time  A SYMBOL_NAME__ DB_NAME__ TICK_TYPE__  SYMBOL_INDEX__
+     0 2003-12-01  1            S1    COMMON          TT               1
+     1 2003-12-01  1            S2   DEMO_L1          TT               2
+
+     Adding symbol parameters before merge:
+
+     >>> symbols = otp.Ticks(SYMBOL_NAME=['S1', 'S2'], param=[1, -1])
+     >>> def func(symbol):
+     ...     pre = otp.Ticks(X=[1])
+     ...     pre["SYMBOL_NAME"] = symbol.name
+     ...     pre["PARAM"] = symbol.param
+     ...     return pre
+     >>> data = otp.merge([func], symbols=symbols)
+     >>> otp.run(data)[['PARAM', 'SYMBOL_NAME']]
+        PARAM SYMBOL_NAME
+     0      1          S1
+     1     -1          S2
+
+     Use the ``output_type_index`` parameter to specify which input class to use to create the output object.
+     It may be useful in case some custom user class was used as input:
+
+     >>> class CustomTick(otp.Tick):
+     ...     def custom_method(self):
+     ...         return 'custom_result'
+     >>> data1 = otp.Tick(A=1)
+     >>> data2 = CustomTick(B=2)
+     >>> data = otp.merge([data1, data2], output_type_index=1)
+     >>> type(data)
+     <class 'onetick.py.functions.CustomTick'>
+     >>> data.custom_method()
+     'custom_result'
+     >>> otp.run(data)
+             Time  A  B
+     0 2003-12-01  1  0
+     1 2003-12-01  0  2
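+
+     A minimal sketch of ``enforce_order`` (output not shown here): ticks with equal
+     timestamps keep the order of the ``sources`` list.
+
+     >>> data = otp.merge([data1, data2], enforce_order=True)  # doctest: +SKIP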
+     """  # noqa: E501
+     from onetick.py.core.source import _Source
+
+     if not sources:
+         raise ValueError("Merge should have one or more inputs")
+
+     output_type = output_type_by_index(sources, output_type_index)
+
+     if presort is adaptive:
+         presort = True if symbols is not None else False
+
+     if concurrency is not default and not presort:
+         warnings.warn("Using the `concurrency` parameter has an effect only when "
+                       "the `presort` parameter is set to True")
+     if batch_size is not default and not presort:
+         warnings.warn("Using the `batch_size` parameter has an effect only when "
+                       "the `presort` parameter is set to True")
+
+     if concurrency is default:
+         concurrency = default_presort_concurrency()
+     if concurrency is None:
+         # None means inherit concurrency from the query where this EP is used
+         # otq.Presort does not support None
+         concurrency = ''
+
+     if batch_size is default:
+         batch_size = config.default_batch_size
+
+     merge_kwargs = {
+         'identify_input_ts': identify_input_ts,
+         'add_symbol_index': add_symbol_index,
+         'separate_db_name': separate_db_name,
+         'added_field_name_suffix': added_field_name_suffix,
+     }
+
+     if 'stabilize_schema' in otq.Merge.Parameters.list_parameters():
+         if stabilize_schema is adaptive:
+             stabilize_schema = False
+         merge_kwargs['stabilize_schema'] = stabilize_schema
+     elif stabilize_schema is not adaptive:
+         raise ValueError("Parameter 'stabilize_schema' is not supported in this OneTick build")
+
+     if symbol_date is not None:
+         if symbols is None:
+             raise ValueError("Parameter 'symbol_date' can only be specified together with parameter 'symbols'")
+         if isinstance(symbols, (str, list)):
+             # this is a hack:
+             # onetick.query doesn't have an interface to set symbol_date for the EP node,
+             # so instead of setting symbols for the EP node
+             # we turn the symbol list into a first-stage query, and symbol_date will be set for this query
+             import onetick.py as otp
+             if isinstance(symbols, str):
+                 symbols = [symbols]
+             symbols = otp.Ticks(SYMBOL_NAME=symbols)
+
+     def _base_ep_for_cross_symbol(symbol, tmp_otq, symbol_date=None):
+         if presort:
+             base_ep = otq.Presort(batch_size=batch_size, max_concurrency=concurrency)
+         else:
+             base_ep = otq.Merge(**merge_kwargs)
+
+         base_ep = apply_symbol_to_ep(base_ep, symbol, tmp_otq, symbol_date=symbol_date)
+
+         return base_ep
+
+     def _evaluate_functions_in_sources_list(sources, symbols):
+         result = []
+
+         if not isinstance(sources, list):
+             sources = [sources]
+
+         for s in sources:
+             if not isinstance(s, _Source) and callable(s):
+                 num_params = len(inspect.signature(s).parameters)
+
+                 if num_params == 0:
+                     s = s()
+                 elif num_params == 1:
+                     s = s(symbols.to_symbol_param() if isinstance(symbols, (_Source, _QueryEvalWrapper))
+                           else _SymbolParamSource())
+                 else:
+                     raise ValueError(
+                         f"Only one parameter is expected for the callback, but {num_params} were passed"
+                     )  # TODO: test this case
+             if isinstance(s, _Source):
+                 result.append(s)
+             else:
+                 raise ValueError("Sources and functions (returning a _Source) are expected as preprocessors")
+         return result
+
+     sources = _evaluate_functions_in_sources_list(sources, symbols)
+     if enforce_order:
+         sources = _enforce_order_for_sources(sources)
+     need_table = False
+     merged_columns, need_table, used_columns = _collect_merged_columns(need_table, sources)
+     need_table = _is_table_after_merge_needed(need_table, used_columns)
+
+     # we need to store internal graphs somewhere while we create the base EP from eval
+     intermediate_tmp_otq = TmpOtq()
+     result = output_type(node=_base_ep_for_cross_symbol(symbols, tmp_otq=intermediate_tmp_otq, symbol_date=symbol_date),
+                          schema=merged_columns)
+     result._tmp_otq.merge(intermediate_tmp_otq)
+
+     __copy_sources_on_merge_or_join(result, sources, symbols, output_type_index=output_type_index)
+
+     if presort:
+         result.sink(otq.Merge(**merge_kwargs))
+
+     if enforce_order:
+         result.drop('OMDSEQ', inplace=True)
+         merged_columns.pop('OMDSEQ')
+
+     if identify_input_ts:
+         result.schema['SYMBOL_NAME' + added_field_name_suffix] = str
+         result.schema['TICK_TYPE' + added_field_name_suffix] = str
+         if separate_db_name:
+             result.schema['DB_NAME' + added_field_name_suffix] = str
+
+     if add_symbol_index:
+         result.schema['SYMBOL_INDEX' + added_field_name_suffix] = int
+
+     result = _add_table_after_merge(align_schema, merged_columns, need_table, result)
+     result._fix_varstrings()
+     return result
+
+
+ def _add_table_after_merge(add_table, merged_columns, need_table, result):
+     if add_table and need_table:
+         # a special case: the add_table parameter may be a list of common columns that
+         # should be added to the final table
+         # it is used internally
+         if isinstance(add_table, list):
+             merged_columns = {key: value for key, value in merged_columns.items() if key in add_table}
+
+         if len(merged_columns):
+             table = otq.Table(
+                 fields=",".join(ott.type2str(dtype) + " " + name for name, dtype in merged_columns.items()),
+                 keep_input_fields=True,
+             )
+             result.sink(table)
+     return result
+
+
+ def __copy_sources_on_merge_or_join(result,
+                                     sources,
+                                     symbols=None,
+                                     names=None,
+                                     drop_meta=False,
+                                     leading=None,
+                                     output_type_index=None,
+                                     use_rename_ep=True):
+     """ Copy columns, state vars and other metadata from the joined or merged sources.
+
+     Parameters
+     ----------
+     result: _Source
+         Source object constructed by a join or merge operation, e.g. result = _Source(otq.Merge(sources))
+     sources: list of _Source, tuple of _Source
+         Sources that were joined or merged
+     symbols:
+         Symbols to copy
+     names: list of str or None, tuple of str or None, bool, optional
+         - If a collection of strings or Nones, then add a passthrough EP with the given name
+           to each of the `sources` where the name is specified, and do not add anything
+           where the corresponding item in names is None.
+         - If True, then autogenerate such names in the __SRC_{number}__ format.
+         - If None or False, then do not add passthrough EPs and do not change node names.
+     drop_meta : bool, optional
+         If True, drop the TIMESTAMP and OMDSEQ fields
+     leading : list of str, tuple of str, optional
+         List of leading source names
+     output_type_index: int, optional
+         Specifies the index of the source in `sources` from which the properties of `result` will be taken.
+         Useful when merging sources that inherited from otp.Source.
+     use_rename_ep: bool
+         Use the :py:class:`onetick.query.RenameFields` event processor or not.
+         This event processor can't be used in generic aggregations.
+
+     Returns
+     -------
+     None
+         Modifies ``result`` in place
+     """
+     from onetick.py.core.source import _Source
+
+     result._copy_state_vars_from(sources)
+     result._clean_sources_dates()  # because it is not a real _source
+
+     for source in sources:
+         result._merge_tmp_otq(source)
+         if source.get_name():
+             if not result.get_name():
+                 result.set_name(source.get_name())
+             if result.get_name() != source.get_name():
+                 warnings.warn(f"Merging/joining sources with different names: '{result.get_name()}' "
+                               f"and '{source.get_name()}'. Some of those names will be lost")
+
+     if isinstance(symbols, _Source):
+         result._merge_tmp_otq(symbols)
+
+     names = __copy_and_rename_nodes_on_merge_join(result, names, sources, symbols)
+
+     if drop_meta:
+         to_drop = list(map(lambda x: x + ".TIMESTAMP", names))
+         to_drop += list(map(lambda x: x + ".OMDSEQ", names))
+         __rename_leading_omdseq(leading, names, result, sources, use_rename_ep=use_rename_ep)
+         result.sink(otq.Passthrough(fields=",".join(to_drop), drop_fields=True))
+
+     if output_type_index is not None:
+         result._copy_properties_from(sources[output_type_index])
+
+
+ def __rename_fields(source, mapping, use_rename_ep=True):
+     """
+     Rename fields from ``mapping`` in ``source``.
+     Note that this is a low-level function that doesn't change the Python schema of the ``source``.
+     Modifies ``source`` in place, doesn't return anything.
+     If ``use_rename_ep`` is True, then the :py:class:`onetick.query.RenameFields` event processor will be used.
+     """
+     if use_rename_ep:
+         source.sink(otq.RenameFields(','.join(f'{k}={v}' for k, v in mapping.items())))
+         return
+     # May be needed because the RenameFields EP is not supported in generic aggregations
+     for old, new in mapping.items():
+         # RenameFields ignores non-existent fields;
+         # all this mess is needed to mimic that logic
+         source.sink(otq.WhereClause(where=f'UNDEFINED("{old}")'))
+         if_branch_graph = source.node().copy_graph()
+         if_branch_rules = source.node().copy_rules()
+         source.sink(otq.AddField(new, old), out_pin='ELSE')
+         source.sink(otq.Passthrough(old, drop_fields=True))
+         source.sink(otq.Merge(identify_input_ts=False))
+         source.source(if_branch_graph)
+         source.node().add_rules(if_branch_rules)
+
+
+ def __rename_leading_omdseq(leading, names, result, sources, use_rename_ep=True):
+     if leading is not None:
+         if len(leading) == 1:
+             leading = leading.pop()
+             __rename_fields(result, {f"{leading}.OMDSEQ": "OMDSEQ"}, use_rename_ep=use_rename_ep)
+         else:
+             number, indexes = __get_number_and_indexes_of_sources_have_field(sources, "OMDSEQ")
+             if number == 1:
+                 __rename_fields(result, {f"{names[indexes.pop()]}.OMDSEQ": "OMDSEQ"}, use_rename_ep=use_rename_ep)
+             elif number:
+                 raise ValueError(
+                     "Several sources were specified as leading and the OMDSEQ field is present in more than "
+                     "one source. The resulting OMDSEQ can't be derived in this case."
+                 )
+
+
+ def __get_number_and_indexes_of_sources_have_field(sources, field):
+     number = 0
+     indexes = []
+     # enumerate() is used so that `indexes` holds the positions of the sources that
+     # have the field (used as names[...] by the caller), not a running match count
+     for i, s in enumerate(sources):
+         if field in s.columns():
+             indexes.append(i)
+             number += 1
+     return number, indexes
+
+
+ def __copy_and_rename_nodes_on_merge_join(result, names, sources, symbols):
+     # shared eps between sources
+     eps = defaultdict()
+     if names is True:
+         names = [f"__SRC_{n}__" for n in range(len(sources))]
+     if not names:
+         names = itertools.repeat(None)
+     if sources:
+         for name, src in zip(names, sources):
+             obj = src
+             if name:
+                 obj = src.copy()
+                 obj.sink(otq.Passthrough())
+                 obj.node_name(name)
+
+             result.source(obj.node().copy_graph(eps))
+             result.node().add_rules(obj.node().copy_rules())
+             result._set_sources_dates(obj, copy_symbols=not bool(symbols))
+     return names
+
+
+ def _is_table_after_merge_needed(need_table, used_columns):
+     if not need_table:
+         for key, value in used_columns.items():
+             if not value:
+                 need_table = True
+                 break
+
+     return need_table
+
+
+ def _collect_merged_columns(need_table, sources):
+     merged_columns = sources[0].columns(skip_meta_fields=True)
+     used_columns = {key: False for key in merged_columns.keys()}
+     for src in sources[1:]:
+         for key, value in src.columns(skip_meta_fields=True).items():
+             if key in merged_columns:
+                 orig_type = merged_columns[key]
+                 try:
+                     merged_dtype, merged_need_table = get_type_that_includes([orig_type, value])
+                 except ValueError as e:
+                     raise ValueError(f"Column '{key}' has different types for "
+                                      f"different branches: {orig_type} {value}") from e
+
+                 need_table |= merged_need_table
+                 merged_columns[key] = merged_dtype
+             else:
+                 need_table = True
+                 merged_columns[key] = value
+
+             if key in used_columns:
+                 used_columns[key] = True
+
+     return merged_columns, need_table, used_columns
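+
+ # Sketch of the widening above (hypothetical schemas): merging {'A': int} with
+ # {'A': float, 'B': str} widens 'A' via get_type_that_includes and sets
+ # need_table=True because 'B' is missing from the first source.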
+
+
+ def concat(sources=None, add_table=True, symbols=None):
+     """ Deprecated: merges ticks from the sources into a single output _source ordered by the timestamp.
+
+     This function is deprecated due to the wrong name notation.
+     Use 'merge' instead.
+
+     Parameters
+     ----------
+     sources : list
+         List of sources to merge
+     add_table : bool
+         If set to True, then a table is added right after the merge.
+         We recommend keeping it True to prevent problems with
+         different tick schemas. Default: True
+
+     Returns
+     -------
+     A new _source that holds the result of the merged sources
+     """
+     warnings.warn("This function is deprecated due to the wrong name notation. Use `merge` instead.", FutureWarning)
+     return merge(sources=sources, align_schema=add_table, symbols=symbols)
+
+
+ def _add_node_name_prefix_to_columns_in_operation(op, src):
+     """
+     Add the node name of source ``src`` as a prefix to all column names in operation ``op``.
+     """
+     if not isinstance(op, Operation):
+         return op
+
+     def fun(operation):
+         if isinstance(operation, ott.ExpressionDefinedTimeOffset) and isinstance(operation.n, Operation):
+             operation.n = operation.n._replace_parameters(fun)
+         if isinstance(operation, Column) and operation.obj_ref is src:
+             column = operation
+             if not src.node_name().strip():
+                 raise ValueError('The node name is set to be used as a column prefix, but the name is empty')
+             name = f'{src.node_name()}.{column.name}'
+             return Column(name, column.dtype, column.obj_ref, precision=getattr(column, "_precision", None))
+         return None
+
+     return op._replace_parameters(fun)
+
+
+ def _enforce_order_for_sources(sources):
+     """
+     Enforce the order of sources by adding/modifying the OMDSEQ field.
+     """
+     result = []
+     for i, source in enumerate(sources):
+         source = source.copy()
+         source = source.table(strict=False, **{'OMDSEQ': int})
+         source['OMDSEQ'] = i
+         # this update_field is needed to let OneTick know that OMDSEQ was changed
+         source.sink(otq.UpdateField(field='TIMESTAMP', value='TIMESTAMP'))
+         result.append(source)
+     return result
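+
+ # Sketch: for sources [s0, s1], every tick of s0 gets OMDSEQ=0 and every tick of s1
+ # gets OMDSEQ=1, so ticks with equal timestamps can later be ordered by source position.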
+
+
+ def join(left, right, on, how='left_outer', rprefix='RIGHT', keep_fields_not_in_schema=False, output_type_index=None):
+     """
+     Joins two sources ``left`` and ``right`` based on the ``on`` condition.
+
+     If you want to add a prefix/suffix to all columns in one of the sources, use
+     :func:`Source.add_prefix` or :func:`Source.add_suffix`.
+
+     Parameters
+     ----------
+     left: :class:`Source`
+         left source to join
+     right: :class:`Source`
+         right source to join
+     on: :py:class:`~onetick.py.Operation` or 'all' or 'same_size' or list of strings
+
+         If 'all', joins every tick from ``left`` with every tick from ``right``.
+
+         If 'same_size' and the sources have the same size, ticks from the two sources are joined pairwise;
+         otherwise an exception is raised.
+
+         If it is a list of strings, then ticks with the same values of the ``on`` fields will be joined.
+
+         If it is an :py:class:`~onetick.py.Operation`, then only ticks for which the condition evaluates to True will be joined.
+     how: 'inner' or 'left_outer'
+         Joining type.
+         Inner join will only produce ticks that matched the ``on`` condition.
+         Left outer join will also produce the ticks from the ``left`` source
+         that didn't match the condition.
+
+         Doesn't matter for ``on='same_size'``.
+     rprefix: str
+         The name of the ``right`` data source. It will be added as a prefix to the overlapping
+         columns that arrive from the right source into the result.
+     keep_fields_not_in_schema: bool
+
+         If True, the join function will try to preserve any fields of the original sources that are not
+         in the source schema, propagating them to the output. This means a runtime error is possible
+         if fields are duplicated.
+
+         If False, all fields that are not in the schema will be removed.
+     output_type_index: int
+         Specifies the index of the source in sources from which the type and properties of the output will be taken.
+         Useful when joining sources that inherited from :class:`Source`.
+         By default, the output object type will be :class:`Source`.
+
+     Note
+     ----
+     ``join`` does some internal optimization when time-based ``on`` conditions are used.
+     The optimization doesn't apply if the ``on`` expression has functions in it,
+     so it is recommended to add/subtract a number of milliseconds (integers) instead.
+
+     See examples for more details.
+
+     Returns
+     -------
+     :class:`Source` or same class as ``[left, right][output_type_index]``
+         joined data
+
+     See also
+     --------
+     **JOIN** OneTick event processor
+
+     Examples
+     --------
+     >>> d1 = otp.Ticks({'ID': [1, 2, 3], 'A': ['a', 'b', 'c']})
+     >>> d2 = otp.Ticks({'ID': [2, 3, 4], 'B': ['q', 'w', 'e']})
+
+     Outer join:
+
+     >>> data = otp.join(d1, d2, on=d1['ID'] == d2['ID'], how='left_outer')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.000   1  a         0
+     1 2003-12-01 00:00:00.001   2  b         2  q
+     2 2003-12-01 00:00:00.002   3  c         3  w
+
+     Inner join:
+
+     >>> data = otp.join(d1, d2, on=d1['ID'] == d2['ID'], how='inner')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.001   2  b         2  q
+     1 2003-12-01 00:00:00.002   3  c         3  w
+
+     Join all ticks:
+
+     >>> data = otp.join(d1, d2, on='all')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.000   1  a         2  q
+     1 2003-12-01 00:00:00.000   1  a         3  w
+     2 2003-12-01 00:00:00.000   1  a         4  e
+     3 2003-12-01 00:00:00.001   2  b         2  q
+     4 2003-12-01 00:00:00.001   2  b         3  w
+     5 2003-12-01 00:00:00.001   2  b         4  e
+     6 2003-12-01 00:00:00.002   3  c         2  q
+     7 2003-12-01 00:00:00.002   3  c         3  w
+     8 2003-12-01 00:00:00.002   3  c         4  e
+
+     Join same-size sources:
+
+     >>> data = otp.join(d1, d2, on='same_size')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.000   1  a         2  q
+     1 2003-12-01 00:00:00.001   2  b         3  w
+     2 2003-12-01 00:00:00.002   3  c         4  e
+
+     Adding a prefix to all columns of the right source:
+
+     >>> d_right = d2.add_prefix('right_')
+     >>> data = otp.join(d1, d_right, on=d1['ID'] == d_right['right_ID'])
+     >>> otp.run(data)
+                          Time  ID  A  right_ID right_B
+     0 2003-12-01 00:00:00.000   1  a         0
+     1 2003-12-01 00:00:00.001   2  b         2       q
+     2 2003-12-01 00:00:00.002   3  c         3       w
+
+     This condition will be optimized during run time:
+
+     >>> data = otp.join(d1, d2, on=(d1['ID'] == d2['ID']) & (d1['Time'] >= d2['Time']), how='left_outer')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.000   1  a         0
+     1 2003-12-01 00:00:00.001   2  b         2  q
+     2 2003-12-01 00:00:00.002   3  c         3  w
+
+     This condition won't be optimized during run time, since it transforms the addition to time into a function.
+     Please note that this way of using ``join`` is not recommended.
+
+     >>> data = otp.join(d1, d2, on=(d1['ID'] == d2['ID']) & (d1['Time'] >= d2['Time'] + otp.Milli(1)), how='left_outer')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.000   1  a         0
+     1 2003-12-01 00:00:00.001   2  b         2  q
+     2 2003-12-01 00:00:00.002   3  c         3  w
+
+     In such cases (adding/subtracting constants to time) add/subtract a number of milliseconds instead.
+     This example returns exactly the same result as the previous one, but it will be optimized,
+     so the runtime will be shorter.
+
+     >>> data = otp.join(d1, d2, on=(d1['ID'] == d2['ID']) & (d1['Time'] >= d2['Time'] + 1), how='left_outer')
+     >>> otp.run(data)
+                          Time  ID  A  RIGHT_ID  B
+     0 2003-12-01 00:00:00.000   1  a         0
+     1 2003-12-01 00:00:00.001   2  b         2  q
+     2 2003-12-01 00:00:00.002   3  c         3  w
+
+     ``on`` can be a list of strings:
+
+     >>> left = otp.Ticks(A=[1, 2, 3], B=[4, 6, 7])
+     >>> right = otp.Ticks(A=[2, 3, 4], B=[6, 9, 8], C=[7, 2, 0])
+     >>> data = otp.join(left, right, on=['A', 'B'], how='inner')
+     >>> otp.run(data)
+                          Time  A  B  C
+     0 2003-12-01 00:00:00.001  2  6  7
+
+     Use the ``output_type_index`` parameter to specify which input class to use to create the output object.
+     It may be useful in case some custom user class was used as input:
+
+     >>> class CustomTick(otp.Tick):
+     ...     def custom_method(self):
+     ...         return 'custom_result'
+     >>> data1 = otp.Tick(A=1)
+     >>> data2 = CustomTick(B=2)
+     >>> data = otp.join(data1, data2, on='same_size', output_type_index=1)
+     >>> type(data)
+     <class 'onetick.py.functions.CustomTick'>
+     >>> data.custom_method()
+     'custom_result'
+     >>> otp.run(data)
+             Time  A  B
+     0 2003-12-01  1  2
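+
+     A minimal sketch of ``keep_fields_not_in_schema`` (output not shown here): fields
+     present on the ticks but missing from the Python-side schema are propagated as-is
+     instead of being dropped.
+
+     >>> data = otp.join(d1, d2, on=d1['ID'] == d2['ID'], keep_fields_not_in_schema=True)  # doctest: +SKIP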
+     """
+     output_type = output_type_by_index((left, right), output_type_index)
+
+     on_list = []
+     if isinstance(on, list):
+         for column in on:
+             if column not in left.schema:
+                 raise ValueError(f'`{column}` column does not exist in the left source.')
+             if column not in right.schema:
+                 raise ValueError(f'`{column}` column does not exist in the right source.')
+         if len(on) == 0:
+             raise ValueError('`on` parameter can not be an empty list.')
+         on_list = on
+         on = (left[on_list[0]] == right[on_list[0]])
+         for column in on_list[1:]:
+             on = on & (left[column] == right[column])
+
+     timezone_hack = None
+     if re.search(r'\b_TIMEZONE\b', str(on)):
+         # join does not support using the _TIMEZONE pseudo-field in join_criteria,
+         # replacing it with temporary fields in the branches
+         timezone_hack = '__TIMEZONE_HACK__'
+         left[timezone_hack] = left['_TIMEZONE']
+         right[timezone_hack] = right['_TIMEZONE']
+
+     if str(on) == "all":
+         on = f'1 = 1 or {rprefix}.TIMESTAMP >= 0'
+
+     _LEFT_NODE_NAME = "__SRC_LEFT__"  # this is an internal name
+     _RIGHT_NODE_NAME = rprefix
+
+     initial_left_source_node_name = left.node_name()
+     initial_right_source_node_name = right.node_name()
+
+     # we have to add the source prefix to all column operations:
+     # the `on` expression is written in terms of left and right, so we modify the
+     # node names here and restore them later
+     left.node_name(_LEFT_NODE_NAME)
+     right.node_name(_RIGHT_NODE_NAME)
+
+     on = _add_node_name_prefix_to_columns_in_operation(on, left)
+     on = _add_node_name_prefix_to_columns_in_operation(on, right)
+
+     columns_name_set = set()
+     columns = {}
+     fields_to_skip_right_source = {'TIMESTAMP'}
+     for name, dtype in chain(left.columns(skip_meta_fields=True).items(), right.columns(skip_meta_fields=True).items()):
+         if name in columns_name_set:
+             columns[_RIGHT_NODE_NAME + "_" + name] = dtype
+             fields_to_skip_right_source.add(name)
+         else:
+             columns[name] = dtype
+             columns_name_set.add(name)
+
+     if how in ("left_outer", "outer"):
+         join_type = "LEFT_OUTER"
+         if how == "outer":
+             warnings.warn("Value 'outer' for parameter 'how' is deprecated. Use 'left_outer' instead.",
+                           FutureWarning)
+     elif how == "inner":
+         join_type = "INNER"
+     else:
+         raise ValueError("The 'how' parameter has a wrong value. Only 'left_outer' and 'inner' are supported")
+
+     if timezone_hack:
+         on = re.sub(r'\._TIMEZONE\b', f'.{timezone_hack}', str(on))
+         on = re.sub(r'\b_TIMEZONE\b', f'{_LEFT_NODE_NAME}.{timezone_hack}', str(on))
+
+     # ------------------
+     # create objects
+     params = {"join_criteria": str(on), "join_type": join_type, "left_source": _LEFT_NODE_NAME}
+
+     # restore the original node names of the sources
+     left.node_name(initial_left_source_node_name)
+     right.node_name(initial_right_source_node_name)
+     if str(on) == "same_size":
+         result = output_type(node=otq.JoinSameSizeTs(), schema=columns)
+     else:
+         result = output_type(node=otq.Join(**params), schema=columns)
+
+     __copy_sources_on_merge_or_join(result, (left, right),
+                                     names=(_LEFT_NODE_NAME, _RIGHT_NODE_NAME),
+                                     output_type_index=output_type_index)
+
+     rename_fields_dict = {}
+     for lc, rc in zip_longest(left.columns(skip_meta_fields=True), right.columns(skip_meta_fields=True)):
+         if lc:
+             rename_fields_dict[f"{_LEFT_NODE_NAME}.{lc}"] = lc
+         if rc:
+             if rc not in fields_to_skip_right_source:
+                 rename_fields_dict[f"{_RIGHT_NODE_NAME}.{rc}"] = rc
+             else:
+                 rename_fields_dict[f"{_RIGHT_NODE_NAME}.{rc}"] = f"{_RIGHT_NODE_NAME}_{rc}"
+
+     __rename_fields(result, rename_fields_dict)
+     result.sink(otq.Passthrough(fields=_LEFT_NODE_NAME + ".TIMESTAMP", drop_fields=True))
+
+     items = []
+     for name, dtype in result.columns(skip_meta_fields=True).items():
+         items.append(ott.type2str(dtype) + " " + name)
+
+     if keep_fields_not_in_schema:
+         # Here we try to preserve fields of the original sources that were not in the schema
+         # in their original form. If there's a duplication of fields or any other problem
+         # at runtime, we'll be able to do nothing
+         result.sink(otq.Passthrough(fields=_RIGHT_NODE_NAME + ".TIMESTAMP", drop_fields=True))
+         result.sink(otq.RenameFieldsEp(rename_fields=rf"{_LEFT_NODE_NAME}\.(.*)=\1,{_RIGHT_NODE_NAME}\.(.*)=\1",
+                                        use_regex=True))
+         result.sink(otq.Table(fields=",".join(items), keep_input_fields=True))
+     else:
+         result.sink(otq.Table(fields=",".join(items)))
+
+     if timezone_hack:
+         result = result.drop([
+             field for field in result.schema
+             if field.endswith(timezone_hack)
+         ])
+         left.drop(timezone_hack, inplace=True)
+         right.drop(timezone_hack, inplace=True)
+
+     for column in on_list:
+         result.drop(f'{_RIGHT_NODE_NAME}_{column}', inplace=True)
+
+     return result
+
+
+ def join_by_time(sources, how="outer", on=None, policy=None, check_schema=True, leading=0,
+                  match_if_identical_times=None, output_type_index=None, use_rename_ep=True,
+                  source_fields_order=None, symbols=None):
+     """
+     Joins ticks from multiple input time series, based on input tick timestamps.
+
+     A tick from the ``leading`` source is joined with the already-arrived ticks from the other sources.
+
+     >>> leading = otp.Ticks(A=[1, 2], offset=[1, 3])
+     >>> other = otp.Ticks(B=[1], offset=[2])
+     >>> otp.run(otp.join_by_time([leading, other]))
+                          Time  A  B
+     0 2003-12-01 00:00:00.001  1  0
+     1 2003-12-01 00:00:00.003  2  1
+
+     Note
+     ----
+     In case different ``sources`` have matching columns, an exception will be raised.
+
+     To fix this error, the
+     :func:`Source.add_prefix` or :func:`Source.add_suffix` functions can be used to rename all columns in a source.
+
+     Note that the resulting **TIMESTAMP** pseudo-column will be taken from the leading source,
+     and the timestamps of ticks from non-leading sources will not be added to the output,
+     so if you need to save them, copy the timestamp to some other column.
+
+     See examples below.
+
+     Parameters
+     ----------
+     sources: Collection[:class:`Source`]
+         The collection of Source objects which will be joined
+     how: 'outer' or 'inner'
+         The method of join ("inner" or "outer").
+         Inner join logic will propagate a tick only if all sources participated in forming it.
+         Outer join will propagate all ticks even if they couldn't be joined with the other sources
+         (in this case the fields from the other sources will have "zero" values depending on the type of the field).
+         Default is "outer".
+     on: Collection[:class:`Column`]
+         ``on`` adds an extra check to the join: only ticks with the same values of the ``on`` fields will be joined
+
+         >>> leading = otp.Ticks(A=[1, 2], offset=[1, 3])
+         >>> other = otp.Ticks(A=[2, 2], B=[1, 2], offset=[0, 2])
+         >>> otp.run(otp.join_by_time([leading, other], on=['A']))
+                              Time  A  B
+         0 2003-12-01 00:00:00.001  1  0
+         1 2003-12-01 00:00:00.003  2  2
+
+     policy: 'arrival_order', 'latest_ticks', 'each_for_leader_with_first' or 'each_for_leader_with_latest'
+         Policy of joining ticks with the same timestamps.
+         The default value is "arrival_order", but it is set to "latest_ticks"
+         if the ``match_if_identical_times`` parameter is set to True.
+
+         >>> leading = otp.Ticks(A=[1, 2], offset=[0, 0], OMDSEQ=[0, 3])
+         >>> other = otp.Ticks(B=[1, 2], offset=[0, 0], OMDSEQ=[2, 4])
+
+         Note: in the examples below we assume that all ticks have the same timestamp, and that the
+         order of ticks is as in the example. OMDSEQ is a special field that stores the order of
+         ticks with the same timestamp.
+
+         - ``arrival_order``
+           An output tick is generated on the arrival of a ``leading`` source tick
+
+           >>> data = otp.join_by_time([leading, other], policy='arrival_order')
+           >>> otp.run(data)[['Time', 'A', 'B']]
+                   Time  A  B
+           0 2003-12-01  1  0
+           1 2003-12-01  2  1
+
+         - ``latest_ticks``
+           A tick is generated at the time of expiration of a particular timestamp (when all ticks from
+           all sources for the current timestamp have arrived). Only the latest tick from the ``leading``
+           source will be used.
+
+           >>> data = otp.join_by_time([leading, other], policy='latest_ticks')
+           >>> otp.run(data)[['Time', 'A', 'B']]
+                   Time  A  B
+           0 2003-12-01  2  2
+
+         - ``each_for_leader_with_first``
+           Each tick from the ``leading`` source will be joined with the first tick from the other sources
+           for the current timestamp
+
+           >>> data = otp.join_by_time(
+           ...     [leading, other],
+           ...     policy='each_for_leader_with_first'
+           ... )
+           >>> otp.run(data)[['Time', 'A', 'B']]
+                   Time  A  B
+           0 2003-12-01  1  1
+           1 2003-12-01  2  1
+
+         - ``each_for_leader_with_latest``
+           Each tick from the ``leading`` source will be joined with the last tick from the other sources
+           for the current timestamp
+
+           >>> data = otp.join_by_time(
+           ...     [leading, other],
+           ...     policy='each_for_leader_with_latest'
+           ... )
+           >>> otp.run(data)[['Time', 'A', 'B']]
+                   Time  A  B
+           0 2003-12-01  1  2
+           1 2003-12-01  2  2
+
+     check_schema: bool
+         If True, onetick.py will check that all column names are unambiguous
+         and that the columns listed in the `on` param exist in the sources' schemas.
+         This can lead to a false positive error
+         if some event processors were sunk to the Source. To avoid this, set check_schema to False.
+     leading: int, 'all', :class:`Source`, list of int, list of :class:`Source`
+         A list of sources or their indexes. If this parameter is 'all', every source is considered to be leading.
+         The logic of the leading source depends on the ``policy`` parameter.
+         The default value is 0, meaning the first specified source will be the leader.
+
+     match_if_identical_times: bool
+         A True value of this parameter causes an output tick to be formed from input ticks with identical
+         timestamps only.
+         If the ``how`` parameter is set to 'outer',
+         default values of fields (``otp.nan``, 0, empty string) are propagated for
+         sources that did not tick at a given timestamp.
+         If this parameter is set to True, the default value of the ``policy`` parameter is set to 'latest_ticks'.
+     output_type_index: int
+         Specifies the index of the source in ``sources`` from which the type and properties of the output will be taken.
+         Useful when joining sources that inherited from :class:`Source`.
+         By default, the output object type will be :class:`Source`.
+     use_rename_ep: bool
+         This parameter specifies whether the :py:class:`onetick.query.RenameFields`
+         event processor will be used in the internal implementation of this function.
+         This event processor can't be used in generic aggregations, so set this parameter to False
+         if ``join_by_time`` is used in generic aggregation logic.
+     source_fields_order: list of int, list of :class:`Source`
+         Controls the order of fields in output ticks.
+         If set, all input source indexes or objects must be specified.
+         By default, the order of the sources is the same as in the ``sources`` list.
+     symbols: str, list of str or functions, :class:`Source`, :py:class:`onetick.query.GraphQuery`
+         Bound symbol(s) passed as a string, a list of strings, or as a "symbols" query whose results
+         include the ``SYMBOL_NAME`` column. The start/end times for the
+         symbols query will be taken from the :meth:`run` params.
+         See :ref:`symbols <static/concepts/symbols:Symbols: bound and unbound>` for more details.
+
+         .. warning::
+             Passing more than one source to join and setting the ``symbols`` parameter at the same time
+             aren't supported
+
+         .. note::
+             If bound symbols are specified as :class:`Source` or :py:class:`onetick.query.GraphQuery`,
+             you **should** set the schema for the returned :class:`Source` object manually:
+             ``onetick-py`` can't determine the symbols from the sub-query before running the query.
+
+         .. note::
+             If bound symbols are specified as :class:`Source` or :py:class:`onetick.query.GraphQuery`,
+             and this sub-query returns only one symbol name, the output columns won't have a prefix
+             with the symbol name.
+
+     See also
+     --------
+     **JOIN_BY_TIME** OneTick event processor
+
+     Examples
+     --------
+     >>> d1 = otp.Ticks({'A': [1, 2, 3], 'offset': [1, 2, 3]})
+     >>> d2 = otp.Ticks({'B': [1, 2, 4], 'offset': [1, 2, 4]})
+     >>> otp.run(d1)
+                          Time  A
+     0 2003-12-01 00:00:00.001  1
+     1 2003-12-01 00:00:00.002  2
+     2 2003-12-01 00:00:00.003  3
+     >>> otp.run(d2)
+                          Time  B
+     0 2003-12-01 00:00:00.001  1
+     1 2003-12-01 00:00:00.002  2
+     2 2003-12-01 00:00:00.004  4
+
+     Default joining logic: outer join with the first source as the leader by default:
+
+     >>> data = otp.join_by_time([d1, d2])
+     >>> otp.run(data)
+                          Time  A  B
+     0 2003-12-01 00:00:00.001  1  0
+     1 2003-12-01 00:00:00.002  2  1
+     2 2003-12-01 00:00:00.003  3  2
+
+     The leading source can be changed with the ``leading`` parameter:
+
+     >>> data = otp.join_by_time([d1, d2], leading=1)
+     >>> otp.run(data)
+                          Time  A  B
+     0 2003-12-01 00:00:00.001  1  1
+     1 2003-12-01 00:00:00.002  2  2
+     2 2003-12-01 00:00:00.004  3  4
+
+     Note that OneTick's logic is different depending on the order of the sources specified,
+     so specifying the ``leading`` parameter in the previous example is not the same as changing
+     the order of sources here:
+
+     >>> data = otp.join_by_time([d2, d1], leading=0)
+     >>> otp.run(data)
+                          Time  B  A
+     0 2003-12-01 00:00:00.001  1  0
+     1 2003-12-01 00:00:00.002  2  1
+     2 2003-12-01 00:00:00.004  4  3
+
+     The ``source_fields_order`` parameter can be used to change the order of fields in the output,
+     but it also affects the joining logic the same way as changing the order of sources:
+
+     >>> data = otp.join_by_time([d1, d2], leading=1, source_fields_order=[1, 0])
+     >>> otp.run(data)
+                          Time  B  A
+     0 2003-12-01 00:00:00.001  1  0
+     1 2003-12-01 00:00:00.002  2  1
+     2 2003-12-01 00:00:00.004  4  3
+
+     The ``how`` parameter can be set to "inner".
+     In this case only ticks that were successfully joined from all sources will be propagated:
+
+     >>> data = otp.join_by_time([d1, d2], how='inner')
+     >>> otp.run(data)
+                          Time  A  B
+     0 2003-12-01 00:00:00.002  2  1
+     1 2003-12-01 00:00:00.003  3  2
+
+     Set the ``match_if_identical_times`` parameter to only join ticks with the same timestamps:
+
+     >>> data = otp.join_by_time([d1, d2], how='inner', match_if_identical_times=True)
+     >>> otp.run(data)
+                          Time  A  B
+     0 2003-12-01 00:00:00.001  1  1
+     1 2003-12-01 00:00:00.002  2  2
+
+     In case of conflicting names in different sources, an exception will be raised:
+
+     >>> d3 = otp.Ticks({'A': [1, 2, 4], 'offset': [1, 2, 4]})
+     >>> data = otp.join_by_time([d1, d3])
+     Traceback (most recent call last):
+         ...
+     ValueError: There are matched columns between sources: A
+
+     Adding a prefix to all columns of the right source will fix this problem:
+
+     >>> data = otp.join_by_time([d1, d3.add_prefix('right_')])
+     >>> otp.run(data)
+                          Time  A  right_A
+     0 2003-12-01 00:00:00.001  1        0
+     1 2003-12-01 00:00:00.002  2        1
+     2 2003-12-01 00:00:00.003  3        2
+
+     Note that timestamps from the non-leading source are not added to the output.
+     You can add them manually in a different field:
+
+     >>> d3['D3_TIMESTAMP'] = d3['TIMESTAMP']
+     >>> data = otp.join_by_time([d1, d3.add_prefix('right_')])
+     >>> otp.run(data)
+                          Time  A  right_A      right_D3_TIMESTAMP
+     0 2003-12-01 00:00:00.001  1        0 1969-12-31 19:00:00.000
+     1 2003-12-01 00:00:00.002  2        1 2003-12-01 00:00:00.001
+     2 2003-12-01 00:00:00.003  3        2 2003-12-01 00:00:00.002
+
+     Use the ``output_type_index`` parameter to specify which input class to use to create the output object.
+     It may be useful in case some custom user class was used as input:
+
+     >>> class CustomTick(otp.Tick):
+     ...     def custom_method(self):
+     ...         return 'custom_result'
+     >>> data1 = otp.Tick(A=1)
+     >>> data2 = CustomTick(B=2)
+     >>> data = otp.join_by_time([data1, data2], match_if_identical_times=True, output_type_index=1)
+     >>> type(data)
+     <class 'onetick.py.functions.CustomTick'>
+     >>> data.custom_method()
+     'custom_result'
+     >>> otp.run(data)
+             Time  A  B
+     0 2003-12-01  1  2
+
+     Use the ``source_fields_order`` parameter to specify the order of output fields:
+
+     >>> a = otp.Ticks(A=[1, 2])
+     >>> b = otp.Ticks(B=[1, 2])
+     >>> c = otp.Ticks(C=[1, 2])
+     >>> data = otp.join_by_time([a, b, c], match_if_identical_times=True, source_fields_order=[c, b, a])
+     >>> otp.run(data)
+                          Time  C  B  A
+     0 2003-12-01 00:00:00.000  1  1  1
+     1 2003-12-01 00:00:00.001  2  2  2
+
+     Indexes can be used too:
+
+     >>> data = otp.join_by_time([a, b, c], match_if_identical_times=True, source_fields_order=[1, 2, 0])
+     >>> otp.run(data)
+                          Time  B  C  A
+     0 2003-12-01 00:00:00.000  1  1  1
+     1 2003-12-01 00:00:00.001  2  2  2
+
+     Use the ``symbols`` parameter to specify bound symbols:
+
+     >>> data = otp.Ticks(X=[1, 2, 3, 4])
+     >>> data = otp.join_by_time([data], symbols=['A', 'B'], match_if_identical_times=True)
+     >>> otp.run(data)
+                          Time  A.X  B.X
+     0 2003-12-01 00:00:00.000    1    1
+     1 2003-12-01 00:00:00.001    2    2
+     2 2003-12-01 00:00:00.002    3    3
+     3 2003-12-01 00:00:00.003    4    4
1189
+
1190
+ Returns
1191
+ -------
1192
+ :class:`Source` or same class as ``sources[output_type_index]``
1193
+ A time series of ticks.
1194
+ """
1195
+ from onetick.py.core.source import _Source
1196
+
1197
+ output_type = output_type_by_index(sources, output_type_index)
1198
+
1199
+ if len(sources) > 1 and symbols:
1200
+ raise ValueError(
1201
+ 'It\'s impossible to use `join_by_time` with multiple sources, '
1202
+ 'when bound symbols are set via `symbols` parameter.'
1203
+ )
1204
+
1205
+ join_str_keys = []
1206
+
1207
+ # if the key is set, then generalize it, i.e. convert it into a list;
1208
+ # then remove the keys from the 'columns_count' dict to pass validation later
1209
+ if on is not None:
1210
+ if isinstance(on, list):
1211
+ # already a list
1212
+ pass
1213
+ elif isinstance(on, Column):
1214
+ on = [on]
1215
+ elif isinstance(on, str):
1216
+ on = [on]
1217
+ else:
1218
+ raise TypeError(f"It is not supported to have '{type(on)}' type as a key")
1219
+
1220
+ for join_key in on:
1221
+ dtypes = set()
1222
+ if check_schema:
1223
+ for source in sources:
1224
+ try:
1225
+ key_type = source.schema[str(join_key)]
1226
+ except KeyError as e:
1227
+ raise KeyError(f"Column '{join_key}' not found in source schema {source}") from e
1228
+ type_name = ott.type2str(key_type)
1229
+ if type_name == "string[64]":
1230
+ type_name = "string"
1231
+ dtypes.add(type_name)
1232
+ if len(dtypes) > 1:
1233
+ raise TypeError(f"Column '{join_key}' has different types in sources: {dtypes}")
1234
+
1235
+ if isinstance(join_key, Column):
1236
+ join_str_keys.append(str(join_key))
1237
+ elif isinstance(join_key, str):
1238
+ join_str_keys.append(join_key)
1239
+
1240
+ if check_schema:
1241
+ _check_schema_for_join_by_time(join_str_keys, sources)
1242
+
1243
+ if how not in ["inner", "outer"]:
1244
+ raise ValueError('Wrong value for the "how" parameter. It is allowed to use "inner" or "outer" values')
1245
+ join_type = how.upper()
1246
+
1247
+ # ------------------
1248
+ # create objects
1249
+ params = {"add_source_prefix": False, "join_type": join_type}
1250
+ leading = _fill_leading_sources_param(leading, params, sources)
1251
+ ordered_sources = _fill_source_fields_order_param(source_fields_order, params, sources)
1252
+
1253
+ if on is not None:
1254
+ params["join_keys"] = ",".join(join_str_keys)
1255
+
1256
+ if policy is not None:
1257
+ policies = {"arrival_order", "latest_ticks", "each_for_leader_with_first", "each_for_leader_with_latest"}
1258
+ if policy.lower() not in policies:
1259
+ raise ValueError("Invalid policy. Only the following ones are allowed: " + ", ".join(policies) + ".")
1260
+ params["same_timestamp_join_policy"] = policy.upper()
1261
+
1262
+ if match_if_identical_times is not None:
1263
+ params["match_if_identical_times"] = match_if_identical_times
1264
+
1265
+ is_bound_multi_symbol = False
1266
+ is_source_symbols = isinstance(symbols, (_Source, _QueryEvalWrapper))
1267
+
1268
+ if isinstance(symbols, list) and len(symbols) > 1 or is_source_symbols:
1269
+ is_bound_multi_symbol = True
1270
+ params['add_source_prefix'] = True
1271
+
1272
+ columns = {name: dtype for src in ordered_sources for name, dtype in src.columns(skip_meta_fields=True).items()}
1273
+
1274
+ tmp_otq = TmpOtq()
1275
+ result = output_type(node=apply_symbol_to_ep(otq.JoinByTime(**params), symbols, tmp_otq), schema=columns)
1276
+ result._tmp_otq.merge(tmp_otq)
1277
+
1278
+ __copy_sources_on_merge_or_join(result, sources,
1279
+ symbols=symbols,
1280
+ names=True,
1281
+ drop_meta=True,
1282
+ leading=leading,
1283
+ output_type_index=output_type_index,
1284
+ use_rename_ep=use_rename_ep)
1285
+
1286
+ if is_bound_multi_symbol:
1287
+ if not is_source_symbols:
1288
+ # this isn't supported for symbols defined as otp.Source
1289
+ new_columns = {
1290
+ f"{sym}.{col}": dtype for col, dtype in columns.items() for sym in symbols
1291
+ }
1292
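+ # e.g. (illustrative) symbols=['A', 'B'] and columns={'X': int}
+ # -> new_columns == {'A.X': int, 'B.X': int}, matching the
+ # per-symbol prefixed fields shown in the docstring example above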
+ result.schema.update(**new_columns)
1293
+
1294
+ result = result.drop(columns=list(columns.keys()))
1295
+
1296
+ if is_source_symbols:
1297
+ result = result.rename({r'__SRC_0__\.(.*)': r'\1'}, use_regex=True)
1298
+
1299
+ if how == "outer":
1300
+ # adding table to convert types in schema, e.g. float to int
1301
+ result._add_table(strict=False)
1302
+
1303
+ return result
1304
+
1305
+
1306
+ def _fill_source_fields_order_param(source_fields_order, params, sources):
1307
+ if source_fields_order is None:
1308
+ return sources
1309
+ if not isinstance(source_fields_order, Sequence):
1310
+ raise ValueError(f"Wrong type for parameter 'source_fields_order': {type(source_fields_order)}")
1311
+ if len(source_fields_order) != len(sources):
1312
+ raise ValueError("Wrong number of sources in parameter 'source_fields_order':"
1313
+ f" {len(source_fields_order)} (need {len(sources)})")
1314
+ if isinstance(source_fields_order[0], int):
1315
+ indexes = source_fields_order
1316
+ ordered_sources = [sources[i] for i in indexes]
1317
+ else:
1318
+ indexes = [__find_by_id(sources, src) for src in source_fields_order]
1319
+ ordered_sources = source_fields_order
1320
+ params['source_order'] = ','.join(f'__SRC_{i}__' for i in indexes)
1321
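+ # A minimal sketch of the expected mapping (illustrative values):
+ # sources = [a, b, c] and source_fields_order = [c, b, a]
+ # -> indexes == [2, 1, 0] and params['source_order'] == '__SRC_2__,__SRC_1__,__SRC_0__'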
+ return ordered_sources
1322
+
1323
+
1324
+ @singledispatch
1325
+ def _fill_leading_sources_param(leading, params, sources):
1326
+ from onetick.py.core.source import _Source
1327
+
1328
+ if isinstance(leading, _Source): # TODO: PY-104 Get rid of circular dependencies in code to avoid local import
1329
+ result = f"__SRC_{__find_by_id(sources, leading)}__"
1330
+ params["leading_sources"] = result
1331
+ result = [result]
1332
+ elif leading == "all": # all sources are leading, which is specified by an empty string
1333
+ params["leading_sources"] = ""
1334
+ result = []
1335
+ else:
1336
+ raise ValueError(
1337
+ "wrong leading param was specified, please use any of int, 'all' literal, list of int, list of _Source"
1338
+ )
1339
+ return result
1340
+
1341
+
1342
+ @_fill_leading_sources_param.register(int)
1343
+ def _(leading, params, sources):
1344
+ if leading < 0:
1345
+ leading = len(sources) + leading
1346
+ if 0 <= leading < len(sources):
1347
+ result = f"__SRC_{leading}__"
1348
+ params["leading_sources"] = result
1349
+ return [result]
1350
+ else:
1351
+ raise ValueError(
1352
+ f"leading source index should be in range(-len(source), len(source)), but {leading} was specified."
1353
+ )
1354
+
1355
+
1356
+ @_fill_leading_sources_param.register(list) # type: ignore # _ already defined above
1357
+ @_fill_leading_sources_param.register(tuple)
1358
+ def _(leading, params, sources):
1359
+ if len(leading) > len(sources):
1360
+ raise ValueError("Number of leading sources can't be bigger number of sources")
1361
+ if isinstance(leading[0], int):
1362
+ result = leading
1363
+ else:
1364
+ result = [__find_by_id(sources, lead) for lead in leading]
1365
+ indexes = ",".join(f"__SRC_{i}__" for i in result)
1366
+ params["leading_sources"] = indexes
1367
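+ # e.g. (illustrative) leading=[0, 2] -> params["leading_sources"] == '__SRC_0__,__SRC_2__'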
+ return result
1368
+
1369
+
1370
+ def __find_by_id(collection, item):
1371
+ for index, s in enumerate(collection):
1372
+ if s is item:
1373
+ return index
1374
+ raise ValueError("The source should be in join sources list")
1375
+
1376
+
1377
+ def _check_schema_for_join_by_time(join_str_keys, sources):
1378
+ # check that there aren't matching columns
1379
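+ # e.g. (illustrative) sources with columns {'K', 'A'} and {'K', 'A', 'B'} joined on key 'K':
+ # after removing 'K', columns_count == {'A': 2, 'B': 1}, so 'A' is reported as matched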
+ columns_count = Counter()
1380
+ for src in sources:
1381
+ columns_count.update(src.columns(skip_meta_fields=True).keys())
1382
+ for join_key in join_str_keys:
1383
+ del columns_count[join_key]
1384
+ matched = [k for k, value in columns_count.items() if value > 1]
1385
+ if "OMDSEQ" in matched:
1386
+ # OMDSEQ behaves like the TIMESTAMP field
1387
+ matched.remove("OMDSEQ")
1388
+ if len(matched):
1389
+ raise ValueError(f"There are matched columns between sources: {','.join(matched)}")
1390
+
1391
+
1392
+ def apply_query(query,
1393
+ in_sources=None,
1394
+ output_pins=None,
1395
+ shared_state_variables_list=None,
1396
+ output_type_index=None,
1397
+ **params):
1398
+ from onetick.py.sources import query as otp_query
1399
+
1400
+ output_type = output_type_by_index(in_sources, output_type_index)
1401
+ output_pins = output_pins if output_pins else []
1402
+ in_sources = in_sources if in_sources else {}
1403
+ shared_state_variables_list = shared_state_variables_list if shared_state_variables_list else []
1404
+ if isinstance(query, str):
1405
+ # a string means a path to the query was passed
1406
+ query = otp_query(query, **params)
1407
+
1408
+ elif isinstance(query, otp_query) and params:
1409
+ query.update_params(**params)
1410
+
1411
+ columns = {}
1412
+
1413
+ for src in in_sources.values():
1414
+ columns.update(src.columns(skip_meta_fields=True))
1415
+
1416
+ str_params = query.str_params
1417
+
1418
+ shared_state_variables = ",".join(shared_state_variables_list)
1419
+
1420
+ inputs_need_unbound_symbols = {in_pin: src._is_unbound_required() for in_pin, src in in_sources.items()}
1421
+ if query.graph_info is not None and query.graph_info.has_unbound_if_pinned(inputs_need_unbound_symbols):
1422
+ symbol = adaptive
1423
+ else:
1424
+ symbol = None
1425
+
1426
+ nested_src = output_type(
1427
+ node=otq.NestedOtq(query.path, str_params, shared_state_variables=shared_state_variables),
1428
+ _has_output=len(output_pins) > 0,
1429
+ _symbols=symbol,
1430
+ schema=columns,
1431
+ )
1432
+
1433
+ eps = defaultdict()
1434
+
1435
+ for in_pin, src in in_sources.items():
1436
+ nested_src.source(src.node().copy_graph(eps), in_pin)
1437
+ nested_src.node().add_rules(src.node().copy_rules())
1438
+ nested_src._set_sources_dates(src)
1439
+ nested_src._merge_tmp_otq(src)
1440
+
1441
+ if len(output_pins) == 0:
1442
+ return nested_src
1443
+
1444
+ if len(output_pins) > 1:
1445
+ result = []
1446
+
1447
+ for out_pin in output_pins:
1448
+ res_src = nested_src.copy()
1449
+ res_src.node().out_pin(out_pin)
1450
+ # NOTE: need to comment out this node
1451
+ res_src.sink(otq.Passthrough())
1452
+
1453
+ # apply config customization
1454
+ query.config._apply(out_pin, res_src)
1455
+
1456
+ result.append(res_src)
1457
+
1458
+ return tuple(result)
1459
+ else:
1460
+ # TODO: move setting out_pin to the creation step of nested_src.
1461
+ # It doesn't seem to work now, because .copy() of _Source doesn't seem to
1462
+ # copy the out_pin reference; need to check
1463
+ nested_src.node().out_pin(output_pins[0])
1464
+
1465
+ # apply config customization
1466
+ query.config._apply(output_pins[0], nested_src)
1467
+
1468
+ return nested_src
1469
+
1470
+
1471
+ def apply(query, *args, **kwargs):
1472
+ return apply_query(query.path, *args, **kwargs, **query.params)
1473
+
1474
+
1475
+ def cut(column: 'Column', bins: Union[int, List[float]], labels: Optional[List[str]] = None):
1476
+ """
1477
+ Bin values into discrete intervals (mimics :pandas:`pandas.cut`).
1478
+
1479
+ Parameters
1480
+ ----------
1481
+ column: :py:class:`~onetick.py.Column`
1482
+ Column with numeric data used to build bins.
1483
+ bins: int or List[float]
1484
+
1485
+ When List[float] - defines the bin edges.
1486
+
1487
+ When int - defines the number of equal-width bins in the range of ``column``.
1488
+ labels: List[str]
1489
+ Labels used to name resulting bins.
1490
+ If not set, bins are numeric intervals like (5.0000000000, 7.5000000000].
1491
+
1492
+ Returns
1493
+ -------
1494
+ object that can be set to :py:class:`~onetick.py.Column` via :py:meth:`~onetick.py.Source.__setitem__`
1495
+
1496
+ Examples
1497
+ --------
1498
+ >>> # OTdirective: snippet-name: Source.functions.cut;
1499
+ >>> data = otp.Ticks({"X": [9, 8, 5, 6, 7, 0, ]})
1500
+ >>> data['bin'] = otp.cut(data['X'], bins=3, labels=['a', 'b', 'c'])
1501
+ >>> otp.run(data)[['X', 'bin']]
1502
+ X bin
1503
+ 0 9 c
1504
+ 1 8 c
1505
+ 2 5 b
1506
+ 3 6 b
1507
+ 4 7 c
1508
+ 5 0 a
1509
+
1510
+ """
1511
+ src = column.obj_ref
1512
+ return _CutBuilder(src, column, bins, labels=labels)
1513
+
1514
+
1515
+ def qcut(column: 'Column', q: Union[int, List[float]], labels: Optional[List[str]] = None):
1516
+ """
1517
+ Quantile-based discretization function (mimics :pandas:`pandas.qcut`).
1518
+
1519
+ Parameters
1520
+ ----------
1521
+ column: :py:class:`~onetick.py.Column`
1522
+ Column with numeric data used to build bins.
1523
+ q: int or List[float]
1524
+
1525
+ When List[float] - array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
1526
+
1527
+ When int - number of quantiles: 10 for deciles, 4 for quartiles, etc.
1528
+ labels: List[str]
1529
+ Labels used to name resulting bins.
1530
+ If not set, bins are numeric intervals like (5.0000000000, 7.5000000000].
1531
+
1532
+ Returns
1533
+ -------
1534
+ object that can be set to :py:class:`~onetick.py.Column` via :py:meth:`~onetick.py.Source.__setitem__`
1535
+
1536
+ Examples
1537
+ --------
1538
+ >>> # OTdirective: snippet-name: Source.functions.qcut;
1539
+ >>> data = otp.Ticks({"X": [10, 3, 5, 6, 7, 1]})
1540
+ >>> data['bin'] = otp.qcut(data['X'], q=3, labels=['a', 'b', 'c'])
1541
+ >>> otp.run(data)[['X', 'bin']]
1542
+ X bin
1543
+ 0 10 c
1544
+ 1 3 a
1545
+ 2 5 b
1546
+ 3 6 b
1547
+ 4 7 c
1548
+ 5 1 a
1549
+ """
1550
+ # TODO when q is a List[float] like [0, .25, .5, .75, 1.]
1551
+ src = column.obj_ref
1552
+ return _QCutBuilder(src, column, q, labels=labels)
1553
+
1554
+
1555
+ def coalesce(sources, max_source_delay: float = 0.0, output_type_index: Optional[int] = None):
1556
+ """
1557
+ Used to fill the gaps in one time series with the ticks from one or several other time series.
1558
+
1559
+ This event processor considers ticks that arrive from several sources at the same time as being the same,
1560
+ allowing for possible delay across the sources when determining whether the ticks are the same.
1561
+ When the same tick arrives from several sources, it is only propagated from the source
1562
+ that has the highest priority among those sources.
1563
+ Input ticks do not necessarily have the same structure - they can have different fields.
1564
+
1565
+ In order to distinguish time series, the event processor adds the SYMBOL_NAME field.
1566
+ Also, a SOURCE field is added to each tick that lacks it, to identify the source from which the tick is coming.
1567
+ Hence, one must avoid adding a SOURCE field in event processors positioned after COALESCE.
1568
+
1569
+ Parameters
1570
+ ----------
1571
+ sources: list of :class:`Source`
1572
+ List of the sources to coalesce. Also, this list is treated as priority order.
1573
+ First member of the list has the highest priority when determining whether ticks are the same.
1574
+ max_source_delay: float
1575
+ The maximum time in seconds by which a tick from one input time series
1576
+ can arrive later than the same tick from another time series.
1577
+ output_type_index: int
1578
+ Specifies the index of the source in ``sources`` from which the type and properties of the output will be taken.
1579
+ Useful when merging sources that inherit from :class:`Source`.
1580
+ By default, output object type will be :class:`Source`.
1581
+
1582
+ Returns
1583
+ -------
1584
+ :class:`Source`
1585
+ A time series of ticks.
1586
+
1587
+ See also
1588
+ --------
1589
+ **COALESCE** OneTick event processor
1590
+
1591
+ Examples
1592
+ --------
1593
+ If ticks from different sources have the same time,
1594
+ only the tick from the source with the highest priority will be propagated.
1595
+
1596
+ >>> data1 = otp.Ticks(A=[1, 2])
1597
+ >>> data2 = otp.Ticks(A=[3, 4])
1598
+ >>> data = otp.coalesce([data2, data1])
1599
+ >>> otp.run(data)[['Time', 'A']]
1600
+ Time A
1601
+ 0 2003-12-01 00:00:00.000 3
1602
+ 1 2003-12-01 00:00:00.001 4
1603
+
1604
+ We can use the ``max_source_delay`` parameter to expand the time interval in which
1605
+ ticks are considered to have the "same time".
1606
+
1607
+ >>> data1 = otp.Ticks({
1608
+ ... 'A': [1, 2, 3],
1609
+ ... 'offset': [0, 3000, 6000],
1610
+ ... })
1611
+ >>> data2 = otp.Ticks({
1612
+ ... 'A': [4, 5, 6],
1613
+ ... # 4 is delayed by less than one second from 1
1614
+ ... # 5 is delayed by one second from 2
1615
+ ... # 6 is delayed by more than one second from 3
1616
+ ... 'offset': [999, 4000, 7001],
1617
+ ... })
1618
+ >>> data = otp.coalesce([data2, data1], max_source_delay=1)
1619
+ >>> otp.run(data)[['Time', 'A']]
1620
+ Time A
1621
+ 0 2003-12-01 00:00:00.999 4
1622
+ 1 2003-12-01 00:00:04.000 5
1623
+ 2 2003-12-01 00:00:06.000 3
1624
+ 3 2003-12-01 00:00:07.001 6
1625
+ """
1626
+ if not sources:
1627
+ raise ValueError("Coalesce should have one or more inputs")
1628
+
1629
+ output_type = output_type_by_index(sources, output_type_index)
1630
+
1631
+ # change node names for sources, COALESCE ep needs them
1632
+ new_node_names = [
1633
+ f'__COALESCE_SRC_{i}__' for i, source in enumerate(sources, start=1)
1634
+ ]
1635
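+ # e.g. three sources -> ['__COALESCE_SRC_1__', '__COALESCE_SRC_2__', '__COALESCE_SRC_3__']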
+
1636
+ node = otq.Coalesce(
1637
+ priority_order=','.join(new_node_names),
1638
+ max_source_delay=max_source_delay,
1639
+ )
1640
+
1641
+ columns = {
1642
+ # these fields will be added by COALESCE ep
1643
+ 'SYMBOL_NAME': str,
1644
+ 'TICK_TYPE': str,
1645
+ }
1646
+ for source in sources:
1647
+ for name in ['SYMBOL_NAME', 'TICK_TYPE']:
1648
+ if name in source.schema:
1649
+ raise ValueError(f"Field with name '{name}' is already present in the source. "
1650
+ 'Please, rename or delete that field prior to invoking coalesce().')
1651
+ shared_columns = set(source.schema).intersection(columns)
1652
+ for name in shared_columns:
1653
+ type_1, type_2 = source.schema[name], columns[name]
1654
+ if type_1 != type_2:
1655
+ raise ValueError(f"Conflicting types for field '{name}' in different sources: {type_1}, {type_2}")
1656
+ columns.update(source.schema)
1657
+
1658
+ # TODO: do we need field SOURCE (especially when node names are auto-generated)?
1659
+ # this field will be added by COALESCE if it's not present in sources
1660
+ columns.setdefault('SOURCE', str)
1661
+
1662
+ result = output_type(node, schema=columns)
1663
+
1664
+ __copy_sources_on_merge_or_join(result, sources, names=new_node_names, output_type_index=output_type_index)
1665
+ return result
1666
+
1667
+
1668
+ def corp_actions(source,
1669
+ adjustment_date: Union[ott.date, ott.datetime, dt.date, dt.datetime, int, str, None] = None,
1670
+ adjustment_date_tz: Union[str, Type[default]] = default,
1671
+ fields=None,
1672
+ adjust_rule="PRICE",
1673
+ apply_split: bool = True,
1674
+ apply_spinoff: bool = False,
1675
+ apply_rights: Optional[bool] = None,
1676
+ apply_cash_dividend: bool = False,
1677
+ apply_stock_dividend: bool = False,
1678
+ apply_security_splice: bool = False,
1679
+ apply_others: str = "",
1680
+ apply_all: bool = False):
1681
+ """
1682
+ Adjusts values using corporate actions information loaded into OneTick
1683
+ from the reference data file. To use it, the location of the reference database must
1684
+ be specified via OneTick configuration.
1685
+
1686
+ Parameters
1687
+ ----------
1688
+ source: :py:class:`onetick.py.Source`
1689
+ Source object adjusted by corporate actions information.
1690
+ adjustment_date : :py:class:`onetick.py.date`, :py:class:`onetick.py.datetime`, int, str, None, optional
1691
+ The date as of which the values are adjusted.
1692
+ All corporate actions of the types specified in the parameters
1693
+ that lie between the tick timestamp and the adjustment date will be applied to each tick.
1694
+
1695
+ This parameter can be either date or datetime.
1696
+ ``int`` and ``str`` formats can be *YYYYMMDD* or *YYYYMMDDhhmmss*.
1697
+ When parameter is a date, the time is assumed to be 17:00:00 GMT
1698
+ and parameter ``adjustment_date_tz`` is ignored.
1699
+
1700
+ If it is not set, the values are adjusted as of the end date in the query.
1701
+
1702
+ Notice that the ``adjustment_date`` is affected neither by *_SYMBOL_PARAM._PARAM_END_TIME_NANOS*
1703
+ nor by the *apply_times_daily* setting in :py:func:`onetick.py.run`.
1704
+
1705
+ adjustment_date_tz : str, optional
1706
+ Timezone for ``adjustment_date``.
1707
+
1708
+ By default global :py:attr:`tz<onetick.py.configuration.Config.tz>` value is used.
1709
+ The local timezone can't be used, so in that case the parameter is set to GMT.
1710
+ When ``adjustment_date`` is in YYYYMMDD format, this parameter is set to GMT.
1711
+ fields : str, optional
1712
+ A comma-separated list of fields to be adjusted. If this parameter is not set,
1713
+ some default adjustments will take place if appropriately named fields exist in the tick:
1714
+
1715
+ - If the ``adjust_rule`` parameter is set to PRICE, and the PRICE field is present,
1716
+ it will get adjusted. If the fields ASK_PRICE or BID_PRICE are present, they will get adjusted.
1717
+ If fields ASK_VALUE or BID_VALUE are present, they will get adjusted.
1718
+
1719
+ - If the ``adjust_rule`` parameter is set to SIZE, and the SIZE field is present,
1720
+ it will get adjusted. If the fields ASK_SIZE or BID_SIZE are present, they will get adjusted.
1721
+ If fields ASK_VALUE or BID_VALUE are present, they will get adjusted.
1722
+
1723
+ adjust_rule : str, optional
1724
+ When set to PRICE, adjustments are applied under the assumption that fields to be adjusted contain prices
1725
+ (adjustment direction is determined appropriately).
1726
+
1727
+ When set to SIZE, adjustments are applied under the assumption that fields contain sizes
1728
+ (adjustment direction is opposite to that when the parameter's value is PRICE).
1729
+
1730
+ By default the value is PRICE.
1731
+ apply_split : bool, optional
1732
+ If True, adjustments for splits are applied.
1733
+ apply_spinoff : bool, optional
1734
+ If True, adjustments for spin-offs are applied.
1735
+ apply_cash_dividend : bool, optional
1736
+ If True, adjustments for cash dividends are applied.
1737
+ apply_stock_dividend : bool, optional
1738
+ If True, adjustments for stock dividends are applied.
1739
+ apply_security_splice : bool, optional
1740
+ If True, adjustments for security splices are applied.
1741
+ apply_others : str, optional
1742
+ A comma-separated list of names of custom adjustment types to apply.
1743
+ apply_all : bool, optional
1744
+ If True, applies all types of adjustments, both built-in and custom.
1745
+
1746
+ Returns
1747
+ -------
1748
+ :py:class:`onetick.py.Source`
1749
+ A new source object with applied adjustments.
1750
+
1751
+ See also
1752
+ --------
1753
+ **CORP_ACTIONS** OneTick event processor
1754
+
1755
+ Examples
1756
+ --------
1757
+ >>> src = otp.DataSource('US_COMP',
1758
+ ... tick_type='TRD',
1759
+ ... start=otp.dt(2022, 5, 20, 9, 30),
1760
+ ... end=otp.dt(2022, 5, 26, 16))
1761
+ >>> df = otp.run(src, symbols='MKD', symbol_date=otp.date(2022, 5, 22))
1762
+ >>> df["PRICE"][0]
1763
+ 0.0911
1764
+ >>> src = otp.corp_actions(src,
1765
+ ... adjustment_date=otp.date(2022, 5, 22),
1766
+ ... fields="PRICE")
1767
+ >>> df = otp.run(src, symbols='MKD', symbol_date=otp.date(2022, 5, 22))
1768
+ >>> df["PRICE"][0]
1769
+ 1.36649931675
1770
+ """
1771
+ source = source.copy()
1772
+
1773
+ if isinstance(adjustment_date, int):
1774
+ adjustment_date = str(adjustment_date)
1775
+
1776
+ is_datetime_param = None
1777
+
1778
+ if adjustment_date is None or isinstance(adjustment_date, str) and adjustment_date == '':
1779
+ # default value for otq.CorpActions
1780
+ adjustment_date = ''
1781
+ elif isinstance(adjustment_date, (ott.datetime, ott.date, dt.datetime, dt.date, str)):
1782
+ if isinstance(adjustment_date, str):
1783
+ try:
1784
+ dt.datetime.strptime(adjustment_date, '%Y%m%d%H%M%S')
1785
+ if len(adjustment_date) != 14:
1786
+ # strptime doesn't require leading zeroes for %m%d%H%M%S specifiers, but we do
1787
+ raise ValueError()
1788
+ is_datetime_param = True
1789
+ except ValueError:
1790
+ try:
1791
+ dt.datetime.strptime(adjustment_date, '%Y%m%d')
1792
+ if len(adjustment_date) != 8:
1793
+ # strptime doesn't require leading zeroes for %m%d specifiers, but we do
1794
+ raise ValueError()
1795
+ is_datetime_param = False
1796
+ except ValueError:
1797
+ raise ValueError("Parameter 'adjustment_date' must be in YYYYMMDDhhmmss or YYYYMMDD formats.")
1798
+ adjustment_date = int(adjustment_date)
1799
+ elif type(adjustment_date) in (ott.datetime, dt.datetime):
1800
+ is_datetime_param = True
1801
+ adjustment_date = int(adjustment_date.strftime('%Y%m%d%H%M%S'))
1802
+ elif type(adjustment_date) in (ott.date, dt.date):
1803
+ is_datetime_param = False
1804
+ adjustment_date = int(adjustment_date.strftime('%Y%m%d'))
1805
+ else:
1806
+ raise ValueError("Parameter 'adjustment_date' must be in YYYYMMDDhhmmss or YYYYMMDD formats.")
1807
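+ # At this point 'adjustment_date' is either '' or an int in YYYYMMDDhhmmss/YYYYMMDD form,
+ # e.g. 20220522170000 -> is_datetime_param=True, 20220522 -> is_datetime_param=False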
+
1808
+ adjustment_date_tz_is_default = adjustment_date_tz is default
1809
+ if adjustment_date_tz_is_default:
1810
+ adjustment_date_tz = config.tz
1811
+
1812
+ if not adjustment_date_tz:
1813
+ warnings.warn("Local timezone can't be used in parameter 'adjustment_date_tz', setting to 'GMT'.")
1814
+ adjustment_date_tz = 'GMT'
1815
+
1816
+ if is_datetime_param is not None and not is_datetime_param and adjustment_date_tz != 'GMT':
1817
+ adjustment_date_tz = 'GMT'
1818
+ if not adjustment_date_tz_is_default:
1819
+ warnings.warn("`adjustment_date_tz` was changed to 'GMT' since "
1820
+ "it is the only valid value when `adjustment_date` is in YYYYMMDD format.")
1821
+
1822
+ kwargs = {}
1823
+ if apply_rights is not None and is_apply_rights_supported(throw_warning=True):
1824
+ kwargs['apply_rights'] = apply_rights
1825
+
1826
+ source.sink(otq.CorpActions(
1827
+ adjustment_date=adjustment_date,
1828
+ adjustment_date_tz=adjustment_date_tz,
1829
+ fields=fields or '',
1830
+ adjust_rule=adjust_rule,
1831
+ apply_split=apply_split,
1832
+ apply_spinoff=apply_spinoff,
1833
+ apply_cash_dividend=apply_cash_dividend,
1834
+ apply_stock_dividend=apply_stock_dividend,
1835
+ apply_security_splice=apply_security_splice,
1836
+ apply_others=apply_others,
1837
+ apply_all=apply_all,
1838
+ **kwargs,
1839
+ ))
1840
+ return source
1841
+
1842
+
1843
+ def save_sources_to_single_file(sources,
1844
+ file_path=None,
1845
+ file_suffix='',
1846
+ start=None,
1847
+ end=None,
1848
+ start_time_expression=None,
1849
+ end_time_expression=None,
1850
+ timezone=None,
1851
+ running_query_flag=None):
1852
+ """
1853
+ Save onetick.py.Source objects to a single file.
1854
+
1855
+ Parameters
1856
+ ----------
1857
+ sources: dict or list
1858
+ dict of names -> sources or list of sources to merge into a single file.
1859
+ If it's a list, the names will be autogenerated.
1860
+ Each source can be an :class:`otp.Source` object or a dictionary with these allowed parameters:
1861
+ {
1862
+ 'source': otp.Source,
1863
+ 'start': datetime(2022, 1, 1), # optional
1864
+ 'end': datetime(2022, 1, 2), # optional
1865
+ 'symbols': otp.Source or otp.Symbols, # optional
1866
+ }
1867
+ file_path: str, optional
1868
+ Path to the file where all sources will be saved.
1869
+ If not set, sources will be saved to temporary file and its name will be returned.
1870
+ file_suffix: str
1871
+ Only used if ``file_path`` is not set.
1872
+ This suffix will be added to the name of a generated query file.
1873
+ start: datetime, optional
1874
+ start time for the resulting query file
1875
+ end: datetime, optional
1876
+ end time for the resulting query file
1877
+ start_time_expression: str, optional
1878
+ start time expression for the resulting query file
1879
+ end_time_expression: str, optional
1880
+ end time expression for the resulting query file
1881
+ timezone: str, optional
1882
+ timezone for the resulting query file
1883
+ running_query_flag: bool, optional
1884
+ running query flag for the resulting query file
1885
+
1886
+ Returns
1887
+ -------
1888
+ If ``sources`` is a list, returns a list of full query paths (path_to_file::query_name)
1889
+ with autogenerated names corresponding to each source from ``sources``.
1890
+ If ``sources`` is a dict, the path to the query file is returned.
1891
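+
+ Examples
+ --------
+ A minimal sketch (the file path and query names below are illustrative;
+ they are autogenerated when not specified)::
+
+     src1 = otp.Tick(A=1)
+     src2 = otp.Tick(B=2)
+     paths = save_sources_to_single_file([src1, src2])
+     # paths is a list like ['<tmp_file>.otq::<name_1>', '<tmp_file>.otq::<name_2>']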
+ """
1892
+ if isinstance(sources, dict):
1893
+ names = sources.keys()
1894
+ sources = sources.values()
1895
+ query_names = None
1896
+ else:
1897
+ names = repeat(None)
1898
+ query_names = []
1899
+ tmp_otq = TmpOtq()
1900
+ for name, source in zip(names, sources):
1901
+ query_start = query_end = query_symbols = query_symbol_date = None
1902
+ if isinstance(source, dict):
1903
+ query_start = source.get('start')
1904
+ query_end = source.get('end')
1905
+ query_symbols = source.get('symbols')
1906
+ query_symbol_date = source.get('symbol_date')
1907
+ source = source['source']
1908
+ query_name = source._store_in_tmp_otq(tmp_otq,
1909
+ name=name,
1910
+ start=query_start,
1911
+ end=query_end,
1912
+ symbols=query_symbols,
1913
+ symbol_date=query_symbol_date)
1914
+ if query_names is not None:
1915
+ query_names.append(query_name)
1916
+ file_path = tmp_otq.save_to_file(
1917
+ file_path=file_path,
1918
+ file_suffix=file_suffix,
1919
+ start=start,
1920
+ end=end,
1921
+ start_time_expression=start_time_expression,
1922
+ end_time_expression=end_time_expression,
1923
+ timezone=timezone,
1924
+ running_query_flag=running_query_flag,
1925
+ )
1926
+ if query_names is not None:
1927
+ return [f'{file_path}::{query_name}' for query_name in query_names]
1928
+ return file_path
1929
+
1930
+
1931
+ class _FormatType(Enum):
1932
+ POSITIONAL = 1
1933
+ OMITTED_POSITIONAL = 2
1934
+ KEY_WORD = 3
1935
+
1936
+
1937
+ def format(format_line: str, *args, **kwargs) -> Operation:
1938
+ """
1939
+ Perform a string formatting operation.
1940
+ Currently, there are only 2 types of formatting available:
1941
+
1942
+ 1. Float precision - ``{:.xf}``, where ``x`` is number, e.g. ``{:.5f}``
1943
+
1944
+ 2. Time formatting - the same as in ``Source.dt.strftime``
1945
+
1946
+ See examples for more information.
1947
+
1948
+ Parameters
1949
+ ----------
1950
+ format_line: str
1951
+ String which contains literal text or replacement fields delimited by braces {}.
1952
+ Only the forms of replacement fields shown in the examples below are supported inside the braces.
1953
+ args
1954
+ Values to paste into the line.
1955
+ kwargs
1956
+ Key-word values to paste into the line.
1957
+
1958
+ Returns
1959
+ -------
1960
+ :py:class:`~onetick.py.Operation` with type equal to :py:class:`~onetick.py.types.varstring`
1961
+
1962
+ Examples
1963
+ --------
1964
+ It allows formatting an :py:class:`~onetick.py.Operation`. For example, a :py:class:`~onetick.py.Column`:
1965
+
1966
+ >>> data = otp.Ticks(A=[1, 2], B=['abc', 'def'])
1967
+ >>> data['C'] = otp.format('A field value is `{}` and B field value is `{}`', data['A'], data['B'])
1968
+ >>> otp.run(data)
1969
+ Time A B C
1970
+ 0 2003-12-01 00:00:00.000 1 abc A field value is `1` and B field value is `abc`
1971
+ 1 2003-12-01 00:00:00.001 2 def A field value is `2` and B field value is `def`
1972
+
1973
+ Formatting can use positional arguments:
1974
+
1975
+ >>> data = otp.Ticks(A=[1, 2], B=['abc', 'def'])
1976
+ >>> data['C'] = otp.format('A is `{0}`, B is `{1}`. Also, A is `{0}`', data['A'], data['B'])
1977
+ >>> otp.run(data)
1978
+ Time A B C
1979
+ 0 2003-12-01 00:00:00.000 1 abc A is `1`, B is `abc`. Also, A is `1`
1980
+ 1 2003-12-01 00:00:00.001 2 def A is `2`, B is `def`. Also, A is `2`
1981
+
1982
+ Formatting can be used with keyword arguments:
1983
+
1984
+ >>> data = otp.Ticks(A=[1, 2], B=['abc', 'def'])
1985
+ >>> data['C'] = otp.format('A is `{a}`, B is `{b}`. Also, A is `{a}`', a=data['A'], b=data['B'])
1986
+ >>> otp.run(data)
1987
+ Time A B C
1988
+ 0 2003-12-01 00:00:00.000 1 abc A is `1`, B is `abc`. Also, A is `1`
1989
+ 1 2003-12-01 00:00:00.001 2 def A is `2`, B is `def`. Also, A is `2`
1990
+
1991
+ Floating-point numbers can be formatted:
1992
+
1993
+ >>> data = otp.Ticks(A=[12.3456, 67.8971])
1994
+ >>> data['B'] = otp.format('A is about {:.2f}', data['A'])
1995
+ >>> otp.run(data)
1996
+ Time A B
1997
+ 0 2003-12-01 00:00:00.000 12.3456 A is about 12.35
1998
+ 1 2003-12-01 00:00:00.001 67.8971 A is about 67.90
1999
+
2000
+ Time can be formatted:
2001
+
2002
+ >>> data = otp.Tick(A=otp.datetime(2020, 4, 5, 17, 56, 3, 789123))
2003
+ >>> data['B'] = otp.format('A is {:%Y/%m/%d %H:%M:%S.%J}', data['A'])
2004
+ >>> otp.run(data)
2005
+ Time A B
2006
+ 0 2003-12-01 2020-04-05 17:56:03.789123 A is 2020/04/05 17:56:03.789123000
2007
+ """
2008
+ _validate_format_line(format_line)
2009
+ format_array = re.split('[{}]', format_line)
2010
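+ # e.g. re.split('[{}]', 'A is {a}, B is {b}') -> ['A is ', 'a', ', B is ', 'b', ''],
+ # so odd indexes hold replacement-field specs and even indexes (from 2) the literal text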
+ format_type = _get_format_type(format_array)
2011
+ res = ott.varstring(format_array[0])
2012
+ cur_index = 0
2013
+ format_spec_array = format_array[1::2]
2014
+ regular_string_array = format_array[2::2]
2015
+ for format_spec, regular_string in zip(format_spec_array, regular_string_array):
2016
+ format_spec_parts = format_spec.split(':', 1)
2017
+ format_spec_param = format_spec_parts[0]
2018
+ format_spec_additional = None if len(format_spec_parts) == 1 else format_spec_parts[1]
2019
+ if format_type == _FormatType.POSITIONAL:
2020
+ res = _add_element(res, args[int(format_spec_param)], format_spec_additional)
2021
+ elif format_type == _FormatType.OMITTED_POSITIONAL:
2022
+ res = _add_element(res, args[cur_index], format_spec_additional)
2023
+ cur_index += 1
2024
+ else:
2025
+ res = _add_element(res, kwargs[format_spec_param], format_spec_additional)
2026
+ res += regular_string
2027
+ return res
2028
+
2029
+
2030
+ def _add_element(cur_res, element, format_spec_additional=None):
2031
+ if isinstance(element, Operation):
2032
+ if format_spec_additional is None:
2033
+ cur_res += element.apply(str)
2034
+ elif issubclass(element.dtype, (float, ott.decimal)) and re.fullmatch(r'\.\d+f', format_spec_additional):
2035
+ # float has strange behavior when precision=0
2036
+ decimal_elem = element.apply(ott.decimal)
2037
+ precision_str = re.findall(r'\d+', format_spec_additional)[0]
2038
+ try:
2039
+ precision = int(precision_str)
2040
+ except ValueError as exc:
2041
+ raise ValueError('Incorrect value for `precision` for formatting decimal number') from exc
2042
+
2043
+ cur_res += decimal_elem.decimal.str(precision)
2044
+ elif issubclass(element.dtype, (ott.nsectime, ott.msectime)):
2045
+ cur_res += element.dt.strftime(format_spec_additional)
2046
+ else:
2047
+ raise ValueError(f'Unsupported formatting `{format_spec_additional}` for field type {element.dtype}')
2048
+ else:
2049
+ if format_spec_additional is None:
2050
+ cur_res += str(element)
2051
+ elif isinstance(element, (float, ott.decimal)):
2052
+ formatting = f'{{:{format_spec_additional}}}'
2053
+ cur_res += formatting.format(element)
2054
+ else:
2055
+ raise ValueError(f'Unsupported formatting `{format_spec_additional}` for literal {type(element)}')
2056
+ return cur_res
2057
+
2058
+
2059
+ def _validate_format_line(format_line: str):
2060
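+ # Rejects e.g. '{ {a}' (nested '{'), 'a }' (stray '}') and 'a {' (unclosed '{')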
+ open_brackets_num = 0
2061
+ close_brackets_num = 0
2062
+ for symbol in format_line:
2063
+ if symbol == '{':
2064
+ open_brackets_num += 1
2065
+ if symbol == '}':
2066
+ close_brackets_num += 1
2067
+ if open_brackets_num > close_brackets_num + 1:
2068
+ raise ValueError("'{' appeared before previous '{' was closed")
2069
+ if open_brackets_num < close_brackets_num:
2070
+ raise ValueError("Single '}' encountered in format string")
2071
+ if open_brackets_num != close_brackets_num:
2072
+ raise ValueError("Single '{' encountered in format string")
2073
+
2074
+
2075
+ def _get_format_type(format_array: List[str]) -> _FormatType:
2076
+ if len(format_array) < 2:
2077
+ return _FormatType.OMITTED_POSITIONAL
2078
+ format_spec_array = format_array[1::2]
2079
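+ # e.g. 'A is {0}, B is {b}' mixes positional and keyword references and is rejected below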
+ uses_positional = False
2080
+ uses_omitted_positional = False
2081
+ uses_key_word = False
2082
+ for format_spec in format_spec_array:
2083
+ format_spec_param = format_spec.split(':')[0]
2084
+ if not format_spec_param:
2085
+ uses_omitted_positional = True
2086
+ elif format_spec_param[0].isdigit():
2087
+ if not format_spec_param.isnumeric():
2088
+ raise ValueError(f'Incorrect positional argument: `{format_spec_param}`')
2089
+ uses_positional = True
2090
+ elif format_spec_param[0].isalpha():
2091
+ # only word characters are supported
2092
+ if not re.fullmatch(r'\w+', format_spec_param):
2093
+ raise ValueError(f'Incorrect key word argument: `{format_spec_param}`')
2094
+ uses_key_word = True
2095
+ else:
2096
+ raise ValueError(f'Unrecognised format specification: `{format_spec_param}`')
2097
+ if uses_positional and not (uses_omitted_positional or uses_key_word):
2098
+ return _FormatType.POSITIONAL
2099
+ if uses_omitted_positional and not (uses_positional or uses_key_word):
2100
+ return _FormatType.OMITTED_POSITIONAL
2101
+ if uses_key_word and not (uses_positional or uses_omitted_positional):
2102
+ return _FormatType.KEY_WORD
2103
+ raise ValueError("Format string has mixed type of referring to arguments which is not allowed")
2104
+
2105
+
2106
+ def join_with_aggregated_window(
2107
+ agg_src, pass_src, aggregation,
2108
+ boundary_aggr_tick: str = 'next',
2109
+ pass_src_delay_msec: int = 0,
2110
+ bucket_interval: int = 0,
2111
+ bucket_units: Literal['seconds', 'ticks', 'days', 'months', 'flexible'] = 'seconds',
2112
+ output_type_index=None,
2113
+ ):
2114
+ """
2115
+ Computes one or more aggregations on ``agg_src`` time series
2116
+ and joins the result with each incoming tick from ``pass_src`` time series.
2117
+
2118
+ Parameters
2119
+ ----------
2120
+ agg_src: :py:class:`onetick.py.Source`
2121
+ Input time series to which aggregation will be applied.
2122
+ pass_src: :py:class:`onetick.py.Source`
2123
+ Input time series that will be joined with the aggregation result.
2124
+ aggregation: dict
2125
+ Dictionary with aggregation output field names and aggregation objects,
2126
+ similar to the one passed to :py:meth:`onetick.py.Source.agg` method.
2127
+ pass_src_delay_msec: int
2128
+ Specifies by how much any incoming tick from the ``pass_src`` is delayed.
2129
+
2130
+ The effective timestamp of a tick from the ``pass_src`` with timestamp ``T`` is ``T - pass_src_delay_msec``.
2131
+ This parameter may be negative, in which case ticks from ``pass_src`` will be joined
2132
+ with the aggregation result of a later timestamp.
2133
+ boundary_aggr_tick: str
2134
+ Controls the logic of joining ticks with the same timestamp.
2135
+
2136
+ If set to **next**, ticks from ``agg_src`` with the same timestamp (+ ``pass_src_delay_msec``)
2137
+ as the latest ticks from ``pass_src`` will not be included in that tick's joined aggregation.
2138
+ bucket_interval: int
2139
+ Determines the length of each bucket (units depend on ``bucket_units``).
2140
+
2141
+ When this parameter is set to 0 (by default),
2142
+ the computation of the aggregation is performed for all ticks starting from the query's start time
2143
+ and until the ``pass_src`` effective tick timestamp ``T - pass_src_delay_msec``,
2144
+ regardless of the value of ``bucket_units``.
2145
+ bucket_units: 'seconds', 'ticks', 'days', 'months'
2146
+ Set bucket interval units.
2147
+ output_type_index: int
2148
+ Specifies the index of the source (between ``agg_src`` and ``pass_src``)
2149
+ from which the type and properties of the output object will be taken.
2150
+ Useful when merging sources that inherit from :class:`Source`.
2151
+ By default, output object type will be :class:`Source`.
2152
+
2153
+ Returns
2154
+ -------
2155
+ :py:class:`onetick.py.Source`
2156
+
2157
+ See also
2158
+ --------
2159
+ **JOIN_WITH_AGGREGATED_WINDOW** OneTick event processor
2160
+
2161
+ Examples
2162
+ --------
2163
+
2164
+ >>> agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
2165
+ >>> pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
2166
+ >>> otp.run(agg_src)
2167
+ Time A
2168
+ 0 2003-12-01 00:00:00.000 0
2169
+ 1 2003-12-01 00:00:00.001 1
2170
+ 2 2003-12-01 00:00:00.002 2
2171
+ 3 2003-12-01 00:00:00.003 3
2172
+ 4 2003-12-01 00:00:00.004 4
2173
+ 5 2003-12-01 00:00:00.005 5
2174
+ 6 2003-12-01 00:00:00.006 6
2175
+ >>> otp.run(pass_src)
2176
+ Time B
2177
+ 0 2003-12-01 00:00:00.001 1
2178
+ 1 2003-12-01 00:00:00.003 3
2179
+ 2 2003-12-01 00:00:00.005 5
2180
+
2181
+ By default the aggregation is applied to the ticks from ``agg_src`` in the bucket
2182
+ from query start time until (but not including) the *effective* timestamp of the tick from ``pass_src``:
2183
+
2184
+ .. testcode::
2185
+ :skipif: not is_supported_join_with_aggregated_window()
2186
+
2187
+ agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
2188
+ pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
2189
+ data = otp.join_with_aggregated_window(
2190
+ agg_src, pass_src, {
2191
+ 'SUM': otp.agg.sum('A'),
2192
+ 'COUNT': otp.agg.count(),
2193
+ }
2194
+ )
2195
+ df = otp.run(data)
2196
+ print(df)
2197
+
2198
+ .. testoutput::
2199
+
2200
+ Time SUM COUNT B
2201
+ 0 2003-12-01 00:00:00.001 0 1 1
2202
+ 1 2003-12-01 00:00:00.003 3 3 3
2203
+ 2 2003-12-01 00:00:00.005 10 5 5
2204
+
2205
+ If you want ticks from ``agg_src`` with timestamp equal to *effective* timestamp of tick from ``pass_src``
2206
+ to be included in bucket, you can set ``boundary_aggr_tick`` to ``previous``:
2207
+
2208
+ .. testcode::
2209
+ :skipif: not is_supported_join_with_aggregated_window()
2210
+
2211
+ agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
2212
+ pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
2213
+ data = otp.join_with_aggregated_window(
2214
+ agg_src, pass_src, {
2215
+ 'SUM': otp.agg.sum('A'),
2216
+ 'COUNT': otp.agg.count(),
2217
+ },
2218
+ boundary_aggr_tick='previous',
2219
+ )
2220
+ df = otp.run(data)
2221
+ print(df)
2222
+
2223
+ .. testoutput::
2224
+
2225
+ Time SUM COUNT B
2226
+ 0 2003-12-01 00:00:00.001 1 2 1
2227
+ 1 2003-12-01 00:00:00.003 6 4 3
2228
+ 2 2003-12-01 00:00:00.005 15 6 5
2229
+
2230
+ Set parameters ``bucket_interval`` and ``bucket_units`` to control the size of the aggregation bucket.
2231
+ For example, to aggregate buckets of two ticks:
2232
+
2233
+ .. testcode::
2234
+ :skipif: not is_supported_join_with_aggregated_window()
2235
+
2236
+ agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
2237
+ pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
2238
+ data = otp.join_with_aggregated_window(
2239
+ agg_src, pass_src, {
2240
+ 'SUM': otp.agg.sum('A'),
2241
+ 'COUNT': otp.agg.count(),
2242
+ },
2243
+ boundary_aggr_tick='previous',
2244
+ bucket_interval=2,
2245
+ bucket_units='ticks',
2246
+ )
2247
+ df = otp.run(data)
2248
+ print(df)
2249
+
2250
+ .. testoutput::
2251
+
2252
+ Time SUM COUNT B
2253
+ 0 2003-12-01 00:00:00.001 1 2 1
2254
+ 1 2003-12-01 00:00:00.003 5 2 3
2255
+ 2 2003-12-01 00:00:00.005 9 2 5
2256
+
2257
+ By default the *effective* timestamp of the tick from ``pass_src`` is the same as the original.
2258
+ It can be changed with parameter ``pass_src_delay_msec``.
2259
+ The *effective* timestamp of the tick is calculated with ``T - pass_src_delay_msec``,
2260
+ and parameter ``pass_src_delay_msec`` can be negative too.
2261
+ This allows shifting the bucket end boundary, like this:
2262
+
2263
+ .. testcode::
2264
+ :skipif: not is_supported_join_with_aggregated_window()
2265
+
2266
+ agg_src = otp.Ticks(A=[0, 1, 2, 3, 4, 5, 6])
2267
+ pass_src = otp.Ticks(B=[1, 3, 5], offset=[1, 3, 5])
2268
+ data = otp.join_with_aggregated_window(
2269
+ agg_src, pass_src, {
2270
+ 'SUM': otp.agg.sum('A'),
2271
+ 'COUNT': otp.agg.count(),
2272
+ },
2273
+ boundary_aggr_tick='previous',
2274
+ pass_src_delay_msec=-1,
2275
+ )
2276
+ df = otp.run(data)
2277
+ print(df)
2278
+
2279
+ .. testoutput::
2280
+
2281
+ Time SUM COUNT B
2282
+ 0 2003-12-01 00:00:00.001 3 3 1
2283
+ 1 2003-12-01 00:00:00.003 10 5 3
2284
+ 2 2003-12-01 00:00:00.005 21 7 5
2285
+
2286
+ Use parameter ``output_type_index`` to specify which input class to use to create the output object.
2287
+ It may be useful when a custom user class was used as input:
2288
+
2289
+ .. testcode::
2290
+ :skipif: not is_supported_join_with_aggregated_window()
2291
+
2292
+ class CustomTick(otp.Tick):
2293
+ def custom_method(self):
2294
+ return 'custom_result'
2295
+ data1 = otp.Tick(A=1)
2296
+ data2 = CustomTick(B=2)
2297
+ data = otp.join_with_aggregated_window(
2298
+ data1, data2, {'A': otp.agg.count()},
2299
+ boundary_aggr_tick='previous',
2300
+ output_type_index=1,
2301
+ )
2302
+ print(type(data))
2303
+ print(repr(data.custom_method()))
2304
+ print(otp.run(data))
2305
+
2306
+ .. testoutput::
2307
+
2308
+ <class 'onetick.py.functions.CustomTick'>
2309
+ 'custom_result'
2310
+ Time A B
2311
+ 0 2003-12-01 1 2
2312
+
2313
+ Use-case: check the volume in the 60 seconds following this trade (not including this trade):
2314
+
2315
+ >>> data = otp.DataSource('US_COMP', tick_type='TRD', symbols='MSFT', date=otp.dt(2022, 3, 3))
2316
+ >>> otp.run(data)
2317
+ Time PRICE SIZE
2318
+ 0 2022-03-03 00:00:00.000 1.0 100
2319
+ 1 2022-03-03 00:00:00.001 1.1 101
2320
+ 2 2022-03-03 00:00:00.002 1.2 102
2321
+ 3 2022-03-03 00:01:00.000 2.0 200
2322
+ 4 2022-03-03 00:01:00.001 2.1 201
2323
+ 5 2022-03-03 00:01:00.002 2.2 202
2324
+
2325
+ .. testcode::
2326
+ :skipif: not is_supported_join_with_aggregated_window()
2327
+
2328
+ data = otp.DataSource('US_COMP', tick_type='TRD', symbols='MSFT', date=otp.dt(2022, 3, 3))
2329
+ data = otp.join_with_aggregated_window(
2330
+ data, data, {'VOLUME': otp.agg.sum('SIZE')},
2331
+ boundary_aggr_tick='next',
2332
+ pass_src_delay_msec=-60000,
2333
+ bucket_interval=60, bucket_units='seconds',
2334
+ )
2335
+ df = otp.run(data)
2336
+ print(df)
2337
+
2338
+ .. testoutput::
2339
+
2340
+ Time VOLUME PRICE SIZE
2341
+ 0 2022-03-03 00:00:00.000 203 1.0 100
2342
+ 1 2022-03-03 00:00:00.001 302 1.1 101
2343
+ 2 2022-03-03 00:00:00.002 401 1.2 102
2344
+ 3 2022-03-03 00:01:00.000 403 2.0 200
2345
+ 4 2022-03-03 00:01:00.001 202 2.1 201
2346
+ 5 2022-03-03 00:01:00.002 0 2.2 202
2347
+ """
2348
+ if not is_supported_join_with_aggregated_window():
2349
+ raise RuntimeError('Function join_with_aggregated_window() is not supported on this OneTick build')
2350
+
2351
+ if boundary_aggr_tick not in {'next', 'previous'}:
2352
+ raise ValueError(f"Wrong value of 'boundary_aggr_tick' parameter: '{boundary_aggr_tick}'")
2353
+ if boundary_aggr_tick == 'next':
2354
+ boundary_aggr_tick_behavior = 'NEXT_WINDOW'
2355
+ is_supported_next_in_join_with_aggregated_window(
2356
+ throw_warning=True,
2357
+ feature_name="setting parameter 'boundary_aggr_tick' to 'next' (as this may result in crash)"
2358
+ )
2359
+ else:
2360
+ boundary_aggr_tick_behavior = 'PREV_WINDOW'
2361
+
2362
+ aggregation_str = ','.join([
2363
+ str(aggr) + " " + name
2364
+ for name, aggr in aggregation.items()
2365
+ ])
2366
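+ # Builds the EP aggregation spec; e.g. (illustrative) {'VOLUME': otp.agg.sum('SIZE')}
+ # may produce the string 'SUM(SIZE) VOLUME'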
+
2367
+ params = dict(
2368
+ aggregation_source='__AGG_SRC__',
2369
+ pass_source='__PASS_SRC__',
2370
+ boundary_aggr_tick_behavior=boundary_aggr_tick_behavior,
2371
+ append_output_field_name=False,
2372
+ aggregation=aggregation_str,
2373
+ pass_source_delay_msec=pass_src_delay_msec,
2374
+ bucket_interval=bucket_interval,
2375
+ bucket_interval_units=bucket_units.upper(),
2376
+ )
2377
+
2378
+ output_type = output_type_by_index((agg_src, pass_src), output_type_index)
2379
+
2380
+ agg_src = agg_src.copy()
2381
+ pass_src = pass_src.copy()
2382
+
2383
+ agg_src.node_name('__AGG_SRC__')
2384
+ pass_src.node_name('__PASS_SRC__')
2385
+
2386
+ columns = {}
2387
+ for name, aggr in aggregation.items():
2388
+ columns.update(aggr._get_output_schema(agg_src, name=name))
2389
+ columns.update(pass_src.schema)
2390
+ result = output_type(node=otq.JoinWithAggregatedWindow(**params), schema=columns)
2391
+
2392
+ __copy_sources_on_merge_or_join(result, (agg_src, pass_src),
2393
+ names=('__AGG_SRC__', '__PASS_SRC__'),
2394
+ output_type_index=output_type_index)
2395
+
2396
+ # adding table to convert types in schema, e.g. float to int
2397
+ result._add_table(strict=False)
2398
+ return result