onetick-py 1.177.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- locator_parser/__init__.py +0 -0
- locator_parser/acl.py +73 -0
- locator_parser/actions.py +262 -0
- locator_parser/common.py +368 -0
- locator_parser/io.py +43 -0
- locator_parser/locator.py +150 -0
- onetick/__init__.py +101 -0
- onetick/doc_utilities/__init__.py +3 -0
- onetick/doc_utilities/napoleon.py +40 -0
- onetick/doc_utilities/ot_doctest.py +140 -0
- onetick/doc_utilities/snippets.py +279 -0
- onetick/lib/__init__.py +4 -0
- onetick/lib/instance.py +141 -0
- onetick/py/__init__.py +293 -0
- onetick/py/_stack_info.py +89 -0
- onetick/py/_version.py +2 -0
- onetick/py/aggregations/__init__.py +11 -0
- onetick/py/aggregations/_base.py +648 -0
- onetick/py/aggregations/_docs.py +948 -0
- onetick/py/aggregations/compute.py +286 -0
- onetick/py/aggregations/functions.py +2216 -0
- onetick/py/aggregations/generic.py +104 -0
- onetick/py/aggregations/high_low.py +80 -0
- onetick/py/aggregations/num_distinct.py +83 -0
- onetick/py/aggregations/order_book.py +501 -0
- onetick/py/aggregations/other.py +1014 -0
- onetick/py/backports.py +26 -0
- onetick/py/cache.py +374 -0
- onetick/py/callback/__init__.py +5 -0
- onetick/py/callback/callback.py +276 -0
- onetick/py/callback/callbacks.py +131 -0
- onetick/py/compatibility.py +798 -0
- onetick/py/configuration.py +771 -0
- onetick/py/core/__init__.py +0 -0
- onetick/py/core/_csv_inspector.py +93 -0
- onetick/py/core/_internal/__init__.py +0 -0
- onetick/py/core/_internal/_manually_bound_value.py +6 -0
- onetick/py/core/_internal/_nodes_history.py +250 -0
- onetick/py/core/_internal/_op_utils/__init__.py +0 -0
- onetick/py/core/_internal/_op_utils/every_operand.py +9 -0
- onetick/py/core/_internal/_op_utils/is_const.py +10 -0
- onetick/py/core/_internal/_per_tick_scripts/tick_list_sort_template.script +121 -0
- onetick/py/core/_internal/_proxy_node.py +140 -0
- onetick/py/core/_internal/_state_objects.py +2312 -0
- onetick/py/core/_internal/_state_vars.py +93 -0
- onetick/py/core/_source/__init__.py +0 -0
- onetick/py/core/_source/_symbol_param.py +95 -0
- onetick/py/core/_source/schema.py +97 -0
- onetick/py/core/_source/source_methods/__init__.py +0 -0
- onetick/py/core/_source/source_methods/aggregations.py +809 -0
- onetick/py/core/_source/source_methods/applyers.py +296 -0
- onetick/py/core/_source/source_methods/columns.py +141 -0
- onetick/py/core/_source/source_methods/data_quality.py +301 -0
- onetick/py/core/_source/source_methods/debugs.py +272 -0
- onetick/py/core/_source/source_methods/drops.py +120 -0
- onetick/py/core/_source/source_methods/fields.py +619 -0
- onetick/py/core/_source/source_methods/filters.py +1002 -0
- onetick/py/core/_source/source_methods/joins.py +1413 -0
- onetick/py/core/_source/source_methods/merges.py +605 -0
- onetick/py/core/_source/source_methods/misc.py +1455 -0
- onetick/py/core/_source/source_methods/pandases.py +155 -0
- onetick/py/core/_source/source_methods/renames.py +356 -0
- onetick/py/core/_source/source_methods/sorts.py +183 -0
- onetick/py/core/_source/source_methods/switches.py +142 -0
- onetick/py/core/_source/source_methods/symbols.py +117 -0
- onetick/py/core/_source/source_methods/times.py +627 -0
- onetick/py/core/_source/source_methods/writes.py +986 -0
- onetick/py/core/_source/symbol.py +205 -0
- onetick/py/core/_source/tmp_otq.py +222 -0
- onetick/py/core/column.py +209 -0
- onetick/py/core/column_operations/__init__.py +0 -0
- onetick/py/core/column_operations/_methods/__init__.py +4 -0
- onetick/py/core/column_operations/_methods/_internal.py +28 -0
- onetick/py/core/column_operations/_methods/conversions.py +216 -0
- onetick/py/core/column_operations/_methods/methods.py +292 -0
- onetick/py/core/column_operations/_methods/op_types.py +160 -0
- onetick/py/core/column_operations/accessors/__init__.py +0 -0
- onetick/py/core/column_operations/accessors/_accessor.py +28 -0
- onetick/py/core/column_operations/accessors/decimal_accessor.py +104 -0
- onetick/py/core/column_operations/accessors/dt_accessor.py +537 -0
- onetick/py/core/column_operations/accessors/float_accessor.py +184 -0
- onetick/py/core/column_operations/accessors/str_accessor.py +1367 -0
- onetick/py/core/column_operations/base.py +1121 -0
- onetick/py/core/cut_builder.py +150 -0
- onetick/py/core/db_constants.py +20 -0
- onetick/py/core/eval_query.py +245 -0
- onetick/py/core/lambda_object.py +441 -0
- onetick/py/core/multi_output_source.py +232 -0
- onetick/py/core/per_tick_script.py +2256 -0
- onetick/py/core/query_inspector.py +464 -0
- onetick/py/core/source.py +1744 -0
- onetick/py/db/__init__.py +2 -0
- onetick/py/db/_inspection.py +1128 -0
- onetick/py/db/db.py +1327 -0
- onetick/py/db/utils.py +64 -0
- onetick/py/docs/__init__.py +0 -0
- onetick/py/docs/docstring_parser.py +112 -0
- onetick/py/docs/utils.py +81 -0
- onetick/py/functions.py +2398 -0
- onetick/py/license.py +190 -0
- onetick/py/log.py +88 -0
- onetick/py/math.py +935 -0
- onetick/py/misc.py +470 -0
- onetick/py/oqd/__init__.py +22 -0
- onetick/py/oqd/eps.py +1195 -0
- onetick/py/oqd/sources.py +325 -0
- onetick/py/otq.py +216 -0
- onetick/py/pyomd_mock.py +47 -0
- onetick/py/run.py +916 -0
- onetick/py/servers.py +173 -0
- onetick/py/session.py +1347 -0
- onetick/py/sources/__init__.py +19 -0
- onetick/py/sources/cache.py +167 -0
- onetick/py/sources/common.py +128 -0
- onetick/py/sources/csv.py +642 -0
- onetick/py/sources/custom.py +85 -0
- onetick/py/sources/data_file.py +305 -0
- onetick/py/sources/data_source.py +1045 -0
- onetick/py/sources/empty.py +94 -0
- onetick/py/sources/odbc.py +337 -0
- onetick/py/sources/order_book.py +271 -0
- onetick/py/sources/parquet.py +168 -0
- onetick/py/sources/pit.py +191 -0
- onetick/py/sources/query.py +495 -0
- onetick/py/sources/snapshots.py +419 -0
- onetick/py/sources/split_query_output_by_symbol.py +198 -0
- onetick/py/sources/symbology_mapping.py +123 -0
- onetick/py/sources/symbols.py +374 -0
- onetick/py/sources/ticks.py +825 -0
- onetick/py/sql.py +70 -0
- onetick/py/state.py +251 -0
- onetick/py/types.py +2131 -0
- onetick/py/utils/__init__.py +70 -0
- onetick/py/utils/acl.py +93 -0
- onetick/py/utils/config.py +186 -0
- onetick/py/utils/default.py +49 -0
- onetick/py/utils/file.py +38 -0
- onetick/py/utils/helpers.py +76 -0
- onetick/py/utils/locator.py +94 -0
- onetick/py/utils/perf.py +498 -0
- onetick/py/utils/query.py +49 -0
- onetick/py/utils/render.py +1374 -0
- onetick/py/utils/script.py +244 -0
- onetick/py/utils/temp.py +471 -0
- onetick/py/utils/types.py +120 -0
- onetick/py/utils/tz.py +84 -0
- onetick_py-1.177.0.dist-info/METADATA +137 -0
- onetick_py-1.177.0.dist-info/RECORD +152 -0
- onetick_py-1.177.0.dist-info/WHEEL +5 -0
- onetick_py-1.177.0.dist-info/entry_points.txt +2 -0
- onetick_py-1.177.0.dist-info/licenses/LICENSE +21 -0
- onetick_py-1.177.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1367 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from onetick.py import types as ott
|
|
4
|
+
from onetick.py import configuration, utils
|
|
5
|
+
from onetick.py.core.column_operations.accessors._accessor import _Accessor
|
|
6
|
+
from onetick.py.core.column_operations.base import _Operation
|
|
7
|
+
from onetick.py.backports import Literal
|
|
8
|
+
from onetick.py.docs.utils import alias
|
|
9
|
+
from onetick.py.compatibility import is_ilike_supported
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _get_onetick_bool_string(value: bool) -> str:
|
|
13
|
+
if value:
|
|
14
|
+
return '"true"'
|
|
15
|
+
return '"false"'
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _StrAccessor(_Accessor):
|
|
19
|
+
""" Accessor for string functions
|
|
20
|
+
|
|
21
|
+
>>> data = otp.Ticks(X=['some string'])
|
|
22
|
+
>>> data["Y"] = data["X"].str.<function_name>() # doctest: +SKIP
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def to_datetime(self,
                format='%Y/%m/%d %H:%M:%S.%J',
                timezone=None,
                unit: Optional[Literal['ms', 'ns']] = None):
    """
    Parse the string as a timestamp (datetime counted from 1970/01/01 GMT).

    Without ``unit`` the string is read according to ``format`` in ``timezone``.
    With ``unit`` the string is treated as a number of milliseconds since 1970/01/01 GMT
    and ``format`` / ``timezone`` are not used.

    Parameters
    ----------
    format: str, Operation, Column
        Layout of the string. Any characters may appear in it, but the following
        combinations of characters have special meanings

        %Y - Year (4 digits)

        %y - Year (2 digits)

        %m - Month (2 digits)

        %d - Day of month (2 digits)

        %H - Hours (2 digits, 24-hour format)

        %I - Hours (2 digits, 12-hour format)

        %M - Minutes (2 digits)

        %S - Seconds (2 digits)

        %J - Nanoseconds (9 digits)

        %p - AM/PM (2 characters)

        Every ``%f`` found in the format is replaced with ``%J`` before parsing.

    timezone: str | Operation | Column
        Timezone. The timezone of the query will be used if no ``timezone`` was passed.
        ``utils.default`` is substituted with the timezone from the configuration.

    unit: str, optional
        If set, `format` and `timezone` are ignored.
        With `ns`, a nanosecond-granularity timestamp is built from a millisecond-granularity
        string of the form: < milliseconds since 1970/01/01 GMT >.< fraction of a millisecond >.
        The fraction might have at most six digits. If the fraction is equal to zero,
        .< fraction of a millisecond > is optional.
        With `ms`, a millisecond-granularity timestamp is built from a millisecond-granularity
        string of the form: < milliseconds since 1970/01/01 GMT >.

    Returns
    -------
        Operation
            :py:class:`nsectime <onetick.py.types.nsectime>` Operation obtained from the string
            (:py:class:`msectime <onetick.py.types.msectime>` when ``unit`` is `ms`)

    Raises
    ------
    ValueError
        If ``unit`` is neither ``None``, `ns` nor `ms`.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.to timestamp;
    >>> data = otp.Tick(X='5/17/22-11:10:56.123456789')
    >>> data['Y'] = data['X'].str.to_datetime('%m/%d/%y-%H:%M:%S.%J', 'Europe/London')
    >>> otp.run(data)
            Time                           X                             Y
    0 2003-12-01  5/17/22-11:10:56.123456789 2022-05-17 06:10:56.123456789

    >>> data = otp.Ticks(A=['1693825877111.002001', '1693825877112'])
    >>> data['NSECTIME_A'] = data['A'].str.to_datetime(unit='ns')
    >>> otp.run(data)
                         Time                     A                    NSECTIME_A
    0 2003-12-01 00:00:00.000  1693825877111.002001 2023-09-04 07:11:17.111002001
    1 2003-12-01 00:00:00.001         1693825877112 2023-09-04 07:11:17.112000000

    >>> data = otp.Tick(A='1693825877111')
    >>> data['MSECTIME_A'] = data['A'].str.to_datetime(unit='ms')
    >>> otp.run(data)
            Time              A              MSECTIME_A
    0 2003-12-01  1693825877111 2023-09-04 07:11:17.111
    """
    if unit is not None:
        # Epoch-string mode: the text is a millisecond counter, no format or timezone involved.
        if unit == 'ns':
            def msec_str_to_nsectime(column):
                return f'MSEC_STR_TO_NSECTIME({ott.value2str(column)})'

            return _StrAccessor.Formatter(
                op_params=[self._base_column],
                dtype=ott.nsectime,
                formatter=msec_str_to_nsectime,
            )
        if unit == 'ms':
            def msec_str_to_msectime(column):
                return f'GET_MSECS(MSEC_STR_TO_NSECTIME({ott.value2str(column)}))'

            return _StrAccessor.Formatter(
                op_params=[self._base_column],
                dtype=ott.msectime,
                formatter=msec_str_to_msectime,
            )
        raise ValueError(f'`{unit}` is unsupported value for `unit` parameter')

    # Formatted-string mode.
    if timezone is utils.default:
        timezone = configuration.config.tz

    def parse_with_format(column, fmt, tz):
        source_expr = ott.value2str(column)
        tz_expr, fmt_expr = self._preprocess_tz_and_format(tz, fmt)
        # '%f' is accepted in place of '%J'
        fmt_expr = fmt_expr.replace('%f', '%J')
        return f'parse_nsectime({fmt_expr},{source_expr},{tz_expr})'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, format, timezone],
        dtype=ott.nsectime,
        formatter=parse_with_format,
    )
|
|
126
|
+
|
|
127
|
+
# Make ``to_datetime`` available under the name ``strptime`` too.
# ``doc_replacer`` rewrites 'to_datetime' to 'strptime' in the text it is given
# (presumably the method's docstring -- confirm against ``alias``).
strptime = alias(to_datetime,
                 doc_replacer=lambda doc: doc.replace('to_datetime', 'strptime'))
|
|
129
|
+
|
|
130
|
+
def token(self, sep=" ", n=0):
    """
    Split the value on the delimiter ``sep`` and return the token at position ``n`` (zero-based).

    An empty string is returned when the value does not have enough tokens to reach position ``n``.

    Parameters
    ----------
    sep: str or Column or Operation
        The delimiter, which must be a single character used to split the string into tokens.
    n: int, Operation
        Token index to return. For a negative ``n``, count from the end instead of the beginning.
        If index is out of range, then empty string is returned.

    Returns
    -------
        Operation
            token at position ``n`` or empty string.

    Raises
    ------
    ValueError
        If ``sep`` is passed as a Python string whose length is not exactly one.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.token;
    >>> data = otp.Tick(X='US_COMP::TRD')
    >>> data['Y'] = data['X'].str.token(':', -1)
    >>> otp.run(data)
            Time             X    Y
    0 2003-12-01  US_COMP::TRD  TRD

    Other columns can be used as parameters too:

    >>> data = otp.Tick(X='US_COMP::TRD', SEP=':', N=-1)
    >>> data['Y'] = data['X'].str.token(data['SEP'], data['N'])
    >>> otp.run(data)
            Time             X  SEP   N    Y
    0 2003-12-01  US_COMP::TRD    :  -1  TRD

    If index is out of range, then empty string is returned:

    >>> data = otp.Tick(X='US_COMP::TRD')
    >>> data['Y'] = data['X'].str.token(':', 999)
    >>> otp.run(data)
            Time             X  Y
    0 2003-12-01  US_COMP::TRD
    """
    # Only a literal delimiter can be validated here; columns and operations are passed through.
    literal_sep_is_invalid = isinstance(sep, str) and len(sep) != 1
    if literal_sep_is_invalid:
        raise ValueError("Function '.str.token()' expects parameter 'sep' to be a single character")

    def build_token_call(column, sep, n):
        # argument order of the generated call is: value, index, delimiter
        value_expr = ott.value2str(column)
        index_expr = ott.value2str(n)
        sep_expr = ott.value2str(sep)
        return f'token({value_expr},{index_expr},{sep_expr})'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, sep, n],
        dtype=self._base_column.dtype,
        formatter=build_token_call,
    )
|
|
182
|
+
|
|
183
|
+
def match(self, pat, case=True):
    r"""
    Test the text against the regular expression given in ``pat``.

    Parameters
    ----------
    pat: str or Column or Operation
        A pattern specified via the POSIX extended regular expression syntax.
    case: bool
        If ``True``, then regular expression is case-sensitive.

    Returns
    -------
        Operation
            ``True`` if the match was successful, ``False`` otherwise.
            Note that boolean Operation is converted to float if added as a column.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.match;
    >>> data = otp.Ticks(X=['hello', 'there were 77 ticks'])
    >>> data['Y'] = data['X'].str.match(r'\d\d')
    >>> otp.run(data)
                         Time                    X    Y
    0 2003-12-01 00:00:00.000                hello  0.0
    1 2003-12-01 00:00:00.001  there were 77 ticks  1.0

    Other columns can be used as parameter ``pat`` too:

    >>> data = otp.Tick(X='OneTick', PAT='onetick')
    >>> data['Y'] = data['X'].str.match(data['PAT'], case=False)
    >>> otp.run(data)
            Time        X      PAT    Y
    0 2003-12-01  OneTick  onetick  1.0

    ``match`` function can also be used as a filter.
    For example, to filter on-exchange continuous trading trades:

    >>> q = otp.DataSource('US_COMP', tick_type='TRD', symbols=['SPY'])  # doctest: +SKIP
    >>> q = q[['PRICE', 'SIZE', 'COND', 'EXCHANGE']]  # doctest: +SKIP
    >>> q = q.where(q['COND'].str.match('^[^O6TUHILNRWZ47QMBCGPV]*$'))  # doctest: +SKIP
    >>> otp.run(q, start=otp.dt(2023, 5, 15, 9, 30), end=otp.dt(2023, 5, 15, 9, 30, 1))  # doctest: +SKIP
                                Time    PRICE  SIZE  COND  EXCHANGE
    0  2023-05-15 09:30:00.000776704  412.220   247               Z
    1  2023-05-15 09:30:00.019069440  412.230   100     F         K
    ..                           ...      ...   ...   ...       ...
    """
    # The generated call takes a "caseless" flag, i.e. the inverse of ``case``.
    caseless_flag = _get_onetick_bool_string(not case)

    def build_regex_match(column, pat):
        value_expr = ott.value2str(column)
        pattern_expr = ott.value2str(pat)
        return f'regex_match({value_expr},{pattern_expr},{caseless_flag})'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, pat],
        dtype=bool,
        formatter=build_regex_match,
    )
|
|
236
|
+
|
|
237
|
+
def len(self):
    """
    Return the length of the string.

    Returns
    -------
        Operation
            The length of the string.
            If a null-character (byte with value ``0``) is present in the string,
            its position (0-based) is returned.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.len;
    >>> data = otp.Ticks(X=['hello', 'world!'])
    >>> data['LEN'] = data['X'].str.len()
    >>> otp.run(data)
                         Time       X  LEN
    0 2003-12-01 00:00:00.000   hello    5
    1 2003-12-01 00:00:00.001  world!    6
    """
    def build_strlen(column):
        return f'strlen({ott.value2str(column)})'

    operands = [self._base_column]
    return _StrAccessor.Formatter(op_params=operands, dtype=int, formatter=build_strlen)
|
|
261
|
+
|
|
262
|
+
def contains(self, substr):
    """
    Tell whether ``substr`` occurs inside the string.

    Note
    ----
    This function does not support regular expressions.
    Use :func:`match` for this purpose.

    Parameters
    ----------
    substr: str or Column or Operation
        A substring to search for within the string.

    Returns
    -------
        Operation
            ``True`` if the string contains the substring, ``False`` otherwise.
            Note that boolean Operation is converted to float if added as a column.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.contains;
    >>> data = otp.Ticks(X=['hello', 'world!'])
    >>> data['CONTAINS'] = data['X'].str.contains('hel')
    >>> otp.run(data)
                         Time       X  CONTAINS
    0 2003-12-01 00:00:00.000   hello       1.0
    1 2003-12-01 00:00:00.001  world!       0.0

    Other columns can be used as parameter ``substr`` too:

    >>> # OTdirective: snippet-name: string.contains another field;
    >>> data = otp.Ticks(X=['hello', 'big', 'world!'],
    ...                  Y=['hel', 'wor', 'wor'])
    >>> data['CONTAINS'] = data['X'].str.contains(data['Y'])
    >>> otp.run(data)
                         Time       X    Y  CONTAINS
    0 2003-12-01 00:00:00.000   hello  hel       1.0
    1 2003-12-01 00:00:00.001     big  wor       0.0
    2 2003-12-01 00:00:00.002  world!  wor       1.0

    This method can also be used for filtering:

    >>> # OTdirective: snippet-name: string.contains as a filter;
    >>> data = otp.Ticks(X=['Hello', 'World'])
    >>> with_substr, wo_substr = data[data['X'].str.contains('Hel')]
    >>> otp.run(with_substr)
            Time      X
    0 2003-12-01  Hello
    """
    def build_instr_check(column, substr):
        haystack_expr = ott.value2str(column)
        needle_expr = ott.value2str(substr)
        # the generated expression treats any position above -1 as "found"
        return f'instr({haystack_expr}, {needle_expr}) > -1'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, substr],
        dtype=bool,
        formatter=build_instr_check,
    )
|
|
318
|
+
|
|
319
|
+
def trim(self):
    """
    Strip white spaces from both ends of the string.

    Returns
    -------
        Operation
            Trimmed string

    See Also
    --------
    :meth:`ltrim`, :meth:`rtrim`

    Examples
    --------
    >>> # OTdirective: snippet-name: string.trim;
    >>> data = otp.Ticks(X=[' Hello', 'World '])
    >>> data['X'] = data['X'].str.trim()
    >>> otp.run(data)
                         Time      X
    0 2003-12-01 00:00:00.000  Hello
    1 2003-12-01 00:00:00.001  World
    """
    def build_trim(column):
        return f'trim({ott.value2str(column)})'

    operands = [self._base_column]
    result_type = self._base_column.dtype  # result keeps the type of the source column
    return _StrAccessor.Formatter(op_params=operands, dtype=result_type, formatter=build_trim)
|
|
345
|
+
|
|
346
|
+
def ltrim(self):
    """
    Strip the leading white spaces from the string.

    Returns
    -------
        Operation
            Trimmed string

    See Also
    --------
    :meth:`trim`, :meth:`rtrim`
    """
    def build_ltrim(column):
        return f'ltrim({ott.value2str(column)})'

    operands = [self._base_column]
    result_type = self._base_column.dtype  # result keeps the type of the source column
    return _StrAccessor.Formatter(op_params=operands, dtype=result_type, formatter=build_ltrim)
|
|
362
|
+
|
|
363
|
+
def rtrim(self):
    """
    Strip the trailing white spaces from the string.

    Returns
    -------
        Operation
            Trimmed string

    See Also
    --------
    :meth:`ltrim`, :meth:`trim`
    """
    def build_rtrim(column):
        return f'rtrim({ott.value2str(column)})'

    operands = [self._base_column]
    result_type = self._base_column.dtype  # result keeps the type of the source column
    return _StrAccessor.Formatter(op_params=operands, dtype=result_type, formatter=build_rtrim)
|
|
379
|
+
|
|
380
|
+
def lower(self):
    """
    Return the string converted to lower case.

    Returns
    -------
        Operation
            Lowercased string

    Examples
    --------
    >>> # OTdirective: snippet-name: string.lower;
    >>> data = otp.Ticks(X=['HeLlO', 'wOrLd!'])
    >>> data['LOW'] = data['X'].str.lower()
    >>> otp.run(data)
                         Time       X     LOW
    0 2003-12-01 00:00:00.000   HeLlO   hello
    1 2003-12-01 00:00:00.001  wOrLd!  world!
    """
    def build_lower(column):
        return f'lower({ott.value2str(column)})'

    operands = [self._base_column]
    result_type = self._base_column.dtype  # result keeps the type of the source column
    return _StrAccessor.Formatter(op_params=operands, dtype=result_type, formatter=build_lower)
|
|
402
|
+
|
|
403
|
+
def upper(self):
    """
    Return the string converted to upper case.

    Returns
    -------
        Operation
            Uppercased string

    Examples
    --------
    >>> # OTdirective: snippet-name: string.upper;
    >>> data = otp.Ticks(X=['HeLlO', 'wOrLd!'])
    >>> data['UP'] = data['X'].str.upper()
    >>> otp.run(data)
                         Time       X      UP
    0 2003-12-01 00:00:00.000   HeLlO   HELLO
    1 2003-12-01 00:00:00.001  wOrLd!  WORLD!
    """
    def build_upper(column):
        return f'upper({ott.value2str(column)})'

    operands = [self._base_column]
    result_type = self._base_column.dtype  # result keeps the type of the source column
    return _StrAccessor.Formatter(op_params=operands, dtype=result_type, formatter=build_upper)
|
|
425
|
+
|
|
426
|
+
def replace(self, pat, repl):
    """
    Replace occurrences of ``pat`` with ``repl``; the search is case dependent.

    Parameters
    ----------
    pat: str or Column or Operation
        Pattern to replace.
    repl: str or Column or Operation
        Replacement string.

    Returns
    -------
        Operation
            String with ``pat`` replaced by ``repl``.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.replace;
    >>> data = otp.Ticks(X=['A Table', 'A Chair', 'An Apple'])
    >>> data['Y'] = data['X'].str.replace('A', 'The')
    >>> otp.run(data)
                         Time         X             Y
    0 2003-12-01 00:00:00.000   A Table     The Table
    1 2003-12-01 00:00:00.001   A Chair     The Chair
    2 2003-12-01 00:00:00.002  An Apple  Then Thepple

    Other columns can be used as parameters too:

    >>> # OTdirective: snippet-name: string.replace from field;
    >>> data = otp.Ticks(X=['A Table', 'A Chair', 'An Apple'],
    ...                  PAT=['A', 'A', 'An'],
    ...                  REPL=['The', 'Their', 'My'])
    >>> data['Y'] = data['X'].str.replace(data['PAT'], data['REPL'])
    >>> otp.run(data)
                         Time         X  PAT   REPL            Y
    0 2003-12-01 00:00:00.000   A Table    A    The    The Table
    1 2003-12-01 00:00:00.001   A Chair    A  Their  Their Chair
    2 2003-12-01 00:00:00.002  An Apple   An     My     My Apple
    """
    # see, BDS-112
    # Anything that is not a Python string literal gets its trailing white spaces removed first
    # (presumably to drop the padding of fixed-size string fields -- confirm against the ticket).
    pattern = pat if isinstance(pat, str) else pat.str.rtrim()
    replacement = repl if isinstance(repl, str) else repl.str.rtrim()

    def build_replace(column, pat, repl):
        value_expr = ott.value2str(column)
        pattern_expr = ott.value2str(pat)
        replacement_expr = ott.value2str(repl)
        return f'replace({value_expr}, {pattern_expr}, {replacement_expr})'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, pattern, replacement],
        dtype=self._base_column.dtype,
        formatter=build_replace,
    )
|
|
479
|
+
|
|
480
|
+
def regex_replace(self, pat, repl, *, replace_every=False, caseless=False):
    r"""
    Replace text matching the regular expression ``pat`` with ``repl``.

    Matching is case dependent unless ``caseless`` is set.

    Parameters
    ----------
    pat: str or Column or Operation
        Pattern to replace specified via the POSIX extended regular expression syntax.
    repl: str or Column or Operation
        Replacement string. ``\0`` refers to the entire matched text. ``\1`` to ``\9`` refer
        to the text matched by the corresponding parenthesized group in ``pat``.
    replace_every: bool
        If ``replace_every`` flag is set to ``True``, all matches will be replaced, if ``False`` only the first one.
    caseless: bool
        If the ``caseless`` flag is set to ``True``, matching is case-insensitive.

    Returns
    -------
        Operation
            String with pattern ``pat`` replaced by ``repl``.

    See Also
    --------
    :meth:`extract`

    Examples
    --------
    >>> # OTdirective: snippet-name: string.regex replace;
    >>> data = otp.Ticks(X=['A Table', 'A Chair', 'An Apple'])
    >>> data['Y'] = data['X'].str.regex_replace('An? ', 'The ')
    >>> otp.run(data)
                         Time         X          Y
    0 2003-12-01 00:00:00.000   A Table  The Table
    1 2003-12-01 00:00:00.001   A Chair  The Chair
    2 2003-12-01 00:00:00.002  An Apple  The Apple

    Parameter ``replace_every`` will replace all occurrences of ``pat`` in the string:

    >>> # OTdirective: snippet-name: string.regex replace all;
    >>> data = otp.Ticks(X=['A Table, A Chair, An Apple'])
    >>> data['Y'] = data['X'].str.regex_replace('An? ', 'The ', replace_every=True)
    >>> otp.run(data)
            Time                           X                                Y
    0 2003-12-01  A Table, A Chair, An Apple  The Table, The Chair, The Apple

    Capturing groups in regular expressions is supported:

    >>> # OTdirective: snippet-name: string.regex groups;
    >>> data = otp.Ticks(X=['11/12/1992', '9/22/1993', '3/30/1991'])
    >>> data['Y'] = data['X'].str.regex_replace(r'(\d{1,2})/(\d{1,2})/', r'\2.\1.')
    >>> otp.run(data)
                         Time           X           Y
    0 2003-12-01 00:00:00.000  11/12/1992  12.11.1992
    1 2003-12-01 00:00:00.001   9/22/1993   22.9.1993
    2 2003-12-01 00:00:00.002   3/30/1991   30.3.1991
    """
    # Both flags are fixed when the operation is created and embedded as quoted literals.
    every_flag = _get_onetick_bool_string(replace_every)
    caseless_flag = _get_onetick_bool_string(caseless)

    def build_regex_replace(column, pat, repl):
        value_expr = ott.value2str(column)
        pattern_expr = ott.value2str(pat)
        replacement_expr = ott.value2str(repl)
        return (
            f'regex_replace({value_expr}, {pattern_expr}, '
            f'{replacement_expr}, {every_flag}, {caseless_flag})'
        )

    return _StrAccessor.Formatter(
        op_params=[self._base_column, pat, repl],
        dtype=self._base_column.dtype,
        formatter=build_regex_replace,
    )
|
|
544
|
+
|
|
545
|
+
def find(self, sub, start=0):
    """
    Locate ``sub`` inside the string and return its index, or ``-1`` when it is absent.

    Parameters
    ----------
    sub: str or Column or Operation
        Substring to find.
    start: int or Column or Operation
        Starting position to find.

    Returns
    -------
        Operation
            The starting position of the substring or ``-1`` if it is not found.

    Examples
    --------
    >>> data = otp.Ticks(X=['ananas', 'banana', 'potato'])
    >>> data['Y'] = data['X'].str.find('ana')  # OTdirective: snippet-name: string.find;
    >>> otp.run(data)
                         Time       X   Y
    0 2003-12-01 00:00:00.000  ananas   0
    1 2003-12-01 00:00:00.001  banana   1
    2 2003-12-01 00:00:00.002  potato  -1

    Other columns can be used as parameter ``sub`` too:

    >>> # OTdirective: snippet-name: string.find field value;
    >>> data = otp.Ticks(X=['Ananas', 'Banana', 'Potato'], sub=['Ana', 'anan', 'ato'])
    >>> data['Y'] = data['X'].str.find(data['sub'])
    >>> otp.run(data)
                         Time       X   sub  Y
    0 2003-12-01 00:00:00.000  Ananas   Ana  0
    1 2003-12-01 00:00:00.001  Banana  anan  1
    2 2003-12-01 00:00:00.002  Potato   ato  3

    Note that empty string will be found at the start of any string:

    >>> data = otp.Ticks(X=['string', ''])
    >>> data['Y'] = data['X'].str.find('')
    >>> otp.run(data)
                         Time       X  Y
    0 2003-12-01 00:00:00.000  string  0
    1 2003-12-01 00:00:00.001          0

    ``start`` parameter is used to find ``sub`` starting from selected position:

    >>> data = otp.Ticks(X=['ababab', 'abbbbb'])
    >>> data['Y'] = data['X'].str.find('ab', 1)
    >>> otp.run(data)
                         Time       X   Y
    0 2003-12-01 00:00:00.000  ababab   2
    1 2003-12-01 00:00:00.001  abbbbb  -1
    """
    def build_locate(column, sub, start):
        needle_expr = ott.value2str(sub)
        haystack_expr = ott.value2str(column)
        # ``start`` is shifted up by one for the generated call and the result is shifted
        # back down by one, so callers work with zero-based positions on both sides.
        start_expr = ott.value2str(start + 1)
        return f'LOCATE({needle_expr}, {haystack_expr}, {start_expr})-1'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, sub, start],
        dtype=int,
        formatter=build_locate,
    )
|
|
608
|
+
|
|
609
|
+
def repeat(self, repeats):
    """
    Build an Operation that duplicates the string ``repeats`` times.

    Note
    ----
    * Multiplying a string column by a number is an alternative to ``repeat``.
    * The result has the same type, and therefore the same maximum length, as the original field.

    Parameters
    ----------
    repeats: int or Column or Operation
        Non-negative number of copies of the string.
        Repeating zero times results in empty string.
        Repeating negative number of times results in exception.

    Returns
    -------
    Operation
        String repeated ``repeats`` times.

    Examples
    --------
    >>> # OTdirective: snippet-name: string.repeat;
    >>> data = otp.Ticks(X=['Banana', 'Ananas', 'Apple'])
    >>> data['X'] = data['X'].str.repeat(3)
    >>> otp.run(data)
                         Time                   X
    0 2003-12-01 00:00:00.000  BananaBananaBanana
    1 2003-12-01 00:00:00.001  AnanasAnanasAnanas
    2 2003-12-01 00:00:00.002     AppleAppleApple

    Other columns can be used as parameter ``repeats`` too:

    >>> # OTdirective: snippet-name: string.repeat from a field;
    >>> data = otp.Ticks(X=['Banana', 'Ananas', 'Apple'], TIMES=[1, 3, 2])
    >>> data['Y'] = data['X'].str.repeat(data['TIMES'])
    >>> otp.run(data)
                         Time       X  TIMES                   Y
    0 2003-12-01 00:00:00.000  Banana      1              Banana
    1 2003-12-01 00:00:00.001  Ananas      3  AnanasAnanasAnanas
    2 2003-12-01 00:00:00.002   Apple      2          AppleApple

    The returned string has the same type and therefore the same maximum length as the original field:

    >>> data = otp.Ticks(X=[otp.string[9]('Banana')])
    >>> data['Y'] = data['X'].str.repeat(3)
    >>> data.schema
    {'X': string[9], 'Y': string[9]}
    >>> otp.run(data)
            Time       X          Y
    0 2003-12-01  Banana  BananaBan

    ``repeat`` does the same thing as multiplication by a non-negative int:

    >>> # OTdirective: snippet-name: string.repeat by multiplication;
    >>> data = otp.Ticks(X=['Banana'], N=[2])
    >>> data['X2'] = data['X'] * data['N']
    >>> data['X3'] = data['X'] * 3
    >>> otp.run(data)
            Time       X  N            X2                  X3
    0 2003-12-01  Banana  2  BananaBanana  BananaBananaBanana

    Repeating zero times results in empty string:

    >>> data = otp.Ticks(X=['Banana', 'Apple'])
    >>> data['Y'] = data['X'].str.repeat(0)
    >>> otp.run(data)
                         Time       X Y
    0 2003-12-01 00:00:00.000  Banana
    1 2003-12-01 00:00:00.001   Apple
    """
    def _render(column, repeats):
        # Parameter names and order mirror the entries of ``op_params`` below.
        source_expr = ott.value2str(column)
        times_expr = ott.value2str(repeats)
        return 'repeat({}, {})'.format(source_expr, times_expr)

    base = self._base_column
    # The declared result type is the type of the source column.
    return _StrAccessor.Formatter(
        op_params=[base, repeats],
        dtype=base.dtype,
        formatter=_render,
    )
|
|
686
|
+
|
|
687
|
+
def extract(self, pat, rewrite=r"\0", caseless=False):
    r"""
    Match the string against the regular expression ``pat`` and return the first match.

    The optional ``rewrite`` parameter arranges the matched substrings and embeds them
    within the string specified in ``rewrite``.

    Parameters
    ----------
    pat: str or Column or Operation
        Pattern to search for specified via the POSIX extended regular expression syntax.
    rewrite: str or Column or Operation
        A string that specifies how to arrange the matched text. ``\0`` refers to the entire matched text.
        ``\1`` to ``\9`` refer to the text matched by the corresponding parenthesized group in ``pat``.
        ``\u`` and ``\l`` modifiers within the ``rewrite`` string convert the case of the text that
        matches the corresponding parenthesized group (e.g., ``\u1`` converts ``\1`` to uppercase).
    caseless: bool
        If the ``caseless`` flag is set to ``True``, matching is case-insensitive.

    Returns
    -------
    Operation
        String matched by ``pat`` with format specified in ``rewrite``.

    See Also
    --------
    regex_replace

    Examples
    --------
    >>> # OTdirective: snippet-name: string.regex extract;
    >>> data = otp.Ticks(X=['Mr. Smith: +1348 +4781', 'Ms. Smith: +8971'])
    >>> data['TEL'] = data['X'].str.extract(r'\+\d{4}')
    >>> otp.run(data)
                         Time                       X    TEL
    0 2003-12-01 00:00:00.000  Mr. Smith: +1348 +4781  +1348
    1 2003-12-01 00:00:00.001        Ms. Smith: +8971  +8971

    You can specify the group to extract in the ``rewrite`` parameter:

    >>> # OTdirective: snippet-name: string.regex extract group;
    >>> data = otp.Ticks(X=['Mr. Smith: 1992/12/22', 'Ms. Smith: 1989/10/15'])
    >>> data['BIRTH_YEAR'] = data['X'].str.extract(r'(\d{4})/(\d{2})/(\d{2})', rewrite=r'birth year: \1')
    >>> otp.run(data)
                         Time                      X        BIRTH_YEAR
    0 2003-12-01 00:00:00.000  Mr. Smith: 1992/12/22  birth year: 1992
    1 2003-12-01 00:00:00.001  Ms. Smith: 1989/10/15  birth year: 1989

    You can use a column as a ``rewrite`` or ``pat`` parameter:

    >>> # OTdirective: snippet-name: string.regex extract from field;
    >>> data = otp.Ticks(X=['Kelly, Mr. James', 'Wilkes, Mrs. James', 'Connolly, Miss. Kate'],
    ...                  PAT=['(Mrs?)\\.', '(Mrs?)\\.', '(Miss)\\.'],
    ...                  REWRITE=['Title 1: \\1', 'Title 2: \\1', 'Title 3: \\1'])
    >>> data['TITLE'] = data['X'].str.extract(data['PAT'], rewrite=data['REWRITE'])
    >>> otp.run(data)
                         Time                     X       PAT      REWRITE          TITLE
    0 2003-12-01 00:00:00.000      Kelly, Mr. James  (Mrs?)\.  Title 1: \1    Title 1: Mr
    1 2003-12-01 00:00:00.001    Wilkes, Mrs. James  (Mrs?)\.  Title 2: \1   Title 2: Mrs
    2 2003-12-01 00:00:00.002  Connolly, Miss. Kate  (Miss)\.  Title 3: \1  Title 3: Miss

    Case of the extracted string can be changed by adding ``l`` and ``u`` to extract group:

    >>> # OTdirective: snippet-name: string.regex extract caseless;
    >>> data = otp.Ticks(NAME=['mr. BroWn', 'Ms. smITh'])
    >>> data['RESULT'] = data['NAME'].str.extract(r'(m)([rs]\. )([a-z])([a-z]*)', r'\u1\l2\u3\l4', caseless=True)
    >>> otp.run(data)
                         Time       NAME     RESULT
    0 2003-12-01 00:00:00.000  mr. BroWn  Mr. Brown
    1 2003-12-01 00:00:00.001  Ms. smITh  Ms. Smith
    """
    # Convert the Python flag once, up front; the closure below embeds the result verbatim.
    caseless_flag = _get_onetick_bool_string(caseless)

    def _render(column, pat, rewrite):
        # Parameter names and order mirror the entries of ``op_params`` below.
        subject = ott.value2str(column)
        pattern_expr = ott.value2str(pat)
        rewrite_expr = ott.value2str(rewrite)
        return 'regex_extract({}, {}, {}, {})'.format(subject, pattern_expr, rewrite_expr, caseless_flag)

    base = self._base_column
    return _StrAccessor.Formatter(
        op_params=[base, pat, rewrite],
        dtype=base.dtype,
        formatter=_render,
    )
|
|
766
|
+
|
|
767
|
+
def substr(self, start, n_bytes=None, rtrim=False):
    """
    Return ``n_bytes`` bytes of the string starting from ``start``.

    For a positive ``start`` return ``n_bytes`` of the string, starting from the position specified by
    ``start`` (0-based).
    For a negative ``start``, the position is counted from the end of the string.
    If the ``n_bytes`` parameter is omitted, returns the part of the input string
    starting at ``start`` till the end of the string.

    Parameters
    ----------
    start: int or Column or Operation
        Index of first symbol in substring
    n_bytes: int or Column or Operation
        Number of bytes in substring
    rtrim: bool
        If set to ``True``, original string will be trimmed from the right side
        before getting the substring, this can be useful with negative ``start`` index.

    Returns
    -------
    Operation
        Substring of string (``n_bytes`` length starting with ``start``).

    Examples
    --------
    >>> # OTdirective: snippet-name: string.substring;
    >>> data = otp.Ticks(X=['abcdef', '12345 '], START_INDEX=[2, 1], N=[2, 3])
    >>> data['FIRST_3'] = data['X'].str.substr(0, 3)
    >>> data['LAST_3'] = data['X'].str.substr(-3, rtrim=True)
    >>> data['CENTER'] = data['X'].str.substr(data['START_INDEX'], data['N'])
    >>> otp.run(data)
                         Time       X  START_INDEX  N FIRST_3 LAST_3 CENTER
    0 2003-12-01 00:00:00.000  abcdef            2  2     abc    def     cd
    1 2003-12-01 00:00:00.001  12345             1  3     123    345    234
    """
    # Optionally strip trailing whitespace first; the result type still comes from the base column.
    subject = self._base_column.str.rtrim() if rtrim else self._base_column
    result_type = self._base_column.dtype

    if n_bytes is not None:
        def _bounded(column, start, n_bytes):
            # Three-argument form: explicit length.
            return 'substr({}, {}, {})'.format(
                ott.value2str(column), ott.value2str(start), ott.value2str(n_bytes)
            )

        return _StrAccessor.Formatter(
            op_params=[subject, start, n_bytes],
            dtype=result_type,
            formatter=_bounded,
        )

    def _to_end(column, start):
        # Two-argument form: everything from ``start`` to the end of the string.
        return 'substr({}, {})'.format(ott.value2str(column), ott.value2str(start))

    return _StrAccessor.Formatter(
        op_params=[subject, start],
        dtype=result_type,
        formatter=_to_end,
    )
|
|
826
|
+
|
|
827
|
+
def get(self, i):
    """
    Return the character at the 0-based position ``i``,
    or an empty string if the position is greater or equal to the length.

    Parameters
    ----------
    i: int or Column or Operation
        Index of the character to find.

    Returns
    -------
    Operation
        Single character located at position ``i`` or an empty string.

    Examples
    --------
    >>> data = otp.Ticks(X=['abcdef', '12345 ', 'qw'], GET_INDEX=[2, 1, 0])
    >>> data['THIRD'] = data['X'].str.get(2)
    >>> data['FROM_INDEX'] = data['X'].str.get(data['GET_INDEX'])
    >>> otp.run(data)
                         Time       X  GET_INDEX THIRD FROM_INDEX
    0 2003-12-01 00:00:00.000  abcdef          2     c          c
    1 2003-12-01 00:00:00.001  12345           1     3          2
    2 2003-12-01 00:00:00.002      qw          0                q

    It is possible to use syntax with indexer to call this method:

    >>> data = otp.Ticks(X=['abcdef', '12345 ', 'qw'])
    >>> data['SECOND'] = data['X'].str[1]
    >>> otp.run(data)
                         Time       X SECOND
    0 2003-12-01 00:00:00.000  abcdef      b
    1 2003-12-01 00:00:00.001  12345       2
    2 2003-12-01 00:00:00.002      qw      w
    """
    def _render(column, i):
        # Parameter names and order mirror the entries of ``op_params`` below.
        byte_expr = 'BYTE_AT({}, {})'.format(ott.value2str(column), ott.value2str(i))
        # A BYTE_AT result of -1 is mapped to an empty string; anything else is turned into a character.
        return f'CASE({byte_expr},-1,"",CHAR({byte_expr}))'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, i],
        dtype=str,
        formatter=_render,
    )
|
|
866
|
+
|
|
867
|
+
def concat(self, other):
    """
    Return a string that is the result of concatenating the value with ``other``.

    Parameters
    ----------
    other: str or Column or Operation
        String to concatenate with.

    Returns
    -------
    Operation
        The original string followed by ``other``.

    Examples
    --------
    >>> data = otp.Ticks(X=['X1', 'X2', 'X3'], Y=['Y1', 'Y2', 'Y3'])
    >>> data['X_WITH_CONST_SUFFIX'] = data['X'].str.concat('_suffix')
    >>> data['X_WITH_Y'] = data['X'].str.concat(data['Y'])
    >>> otp.run(data)
                         Time   X   Y X_WITH_CONST_SUFFIX X_WITH_Y
    0 2003-12-01 00:00:00.000  X1  Y1           X1_suffix     X1Y1
    1 2003-12-01 00:00:00.001  X2  Y2           X2_suffix     X2Y2
    2 2003-12-01 00:00:00.002  X3  Y3           X3_suffix     X3Y3
    """
    def _render(column, other):
        # Parameter names and order mirror the entries of ``op_params`` below.
        left_expr = ott.value2str(column)
        right_expr = ott.value2str(other)
        return 'CONCAT({}, {})'.format(left_expr, right_expr)

    base = self._base_column
    return _StrAccessor.Formatter(
        op_params=[base, other],
        dtype=base.dtype,
        formatter=_render,
    )
|
|
892
|
+
|
|
893
|
+
def insert(self, start, length, value):
    """
    Return a string where ``length`` characters have been deleted from the string,
    beginning at ``start``, and where ``value`` has been inserted at the same position.

    As the examples below show, positions are counted from 1.

    Parameters
    ----------
    start: int or Column or Operation
        Position to remove from and to insert into.
    length: int or Column or Operation
        Number of characters to remove.
    value: str or Column or Operation
        String to insert.

    Returns
    -------
    Operation
        String with the requested part replaced by ``value``.

    Examples
    --------
    >>> data = otp.Ticks(X=['aaaaaaa', 'bbbbb', 'cccc'], Y=['ddd', 'ee', 'f'])
    >>> data['INSERTED_1'] = data['X'].str.insert(3, 1, 'X')
    >>> data['INSERTED_2'] = data['X'].str.insert(3, 2, 'X')
    >>> data['INSERTED_Y'] = data['X'].str.insert(3, 2, data['Y'])
    >>> otp.run(data)
                         Time        X    Y INSERTED_1 INSERTED_2 INSERTED_Y
    0 2003-12-01 00:00:00.000  aaaaaaa  ddd    aaXaaaa     aaXaaa   aadddaaa
    1 2003-12-01 00:00:00.001    bbbbb   ee      bbXbb       bbXb      bbeeb
    2 2003-12-01 00:00:00.002     cccc    f       ccXc        ccX        ccf

    It is possible to insert without removal:

    >>> data = otp.Ticks(X=['aaaaaaa', 'bbbbb', 'cccc'])
    >>> data['INSERTED'] = data['X'].str.insert(3, 0, 'X')
    >>> otp.run(data)
                         Time        X  INSERTED
    0 2003-12-01 00:00:00.000  aaaaaaa  aaXaaaaa
    1 2003-12-01 00:00:00.001    bbbbb    bbXbbb
    2 2003-12-01 00:00:00.002     cccc     ccXcc

    It is possible to remove without insertion:

    >>> data = otp.Ticks(X=['aaaaaaa', 'bbbbb', 'cccc'])
    >>> data['REMOVED'] = data['X'].str.insert(3, 2, '')
    >>> otp.run(data)
                         Time        X REMOVED
    0 2003-12-01 00:00:00.000  aaaaaaa   aaaaa
    1 2003-12-01 00:00:00.001    bbbbb     bbb
    2 2003-12-01 00:00:00.002     cccc      cc
    """
    def _render(column, start, length, value):
        # Parameter names and order mirror the entries of ``op_params`` below.
        subject = ott.value2str(column)
        start_expr = ott.value2str(start)
        length_expr = ott.value2str(length)
        value_expr = ott.value2str(value)
        return 'INSERT({}, {}, {}, {})'.format(subject, start_expr, length_expr, value_expr)

    base = self._base_column
    return _StrAccessor.Formatter(
        op_params=[base, start, length, value],
        dtype=base.dtype,
        formatter=_render,
    )
|
|
948
|
+
|
|
949
|
+
def first(self, count=1):
    """
    Return the first ``count`` symbols of the string.

    Parameters
    ----------
    count: int or Column or Operation
        Number of first symbols to return. Default: 1

    Returns
    -------
    Operation
        Leading ``count`` symbols; the whole string if it is shorter than ``count``
        (see the last row of the example).

    Examples
    --------
    >>> data = otp.Ticks(X=['abc', 'bac', 'cba'], Y=[3, 1, 10])
    >>> data['FIRST'] = data['X'].str.first()
    >>> data['FIRST_Y'] = data['X'].str.first(data['Y'])
    >>> otp.run(data)
                         Time    X   Y FIRST FIRST_Y
    0 2003-12-01 00:00:00.000  abc   3     a     abc
    1 2003-12-01 00:00:00.001  bac   1     b       b
    2 2003-12-01 00:00:00.002  cba  10     c     cba
    """
    def _render(column, count):
        # Parameter names and order mirror the entries of ``op_params`` below.
        subject = ott.value2str(column)
        amount = ott.value2str(count)
        return 'LEFT({}, {})'.format(subject, amount)

    # Note: the result is declared as plain ``str`` here, not as the base column type.
    return _StrAccessor.Formatter(
        op_params=[self._base_column, count],
        dtype=str,
        formatter=_render,
    )
|
|
974
|
+
|
|
975
|
+
def last(self, count=1):
    """
    Return the last ``count`` symbols of the string.

    Parameters
    ----------
    count: int or Column or Operation
        Number of last symbols to return. Default: 1

    Returns
    -------
    Operation
        Trailing ``count`` symbols; the whole string if it is shorter than ``count``.

    Examples
    --------
    >>> data = otp.Ticks(X=['abc', 'bac', 'cba'], Y=[3, 1, 9])
    >>> data['LAST'] = data['X'].str.last()
    >>> data['LAST_Y'] = data['X'].str.last(data['Y'])
    >>> otp.run(data)
                         Time    X  Y LAST LAST_Y
    0 2003-12-01 00:00:00.000  abc  3    c    abc
    1 2003-12-01 00:00:00.001  bac  1    c      c
    2 2003-12-01 00:00:00.002  cba  9    a    cba
    """
    def _render(column, count):
        # RIGHT misbehaves with a negative index and RIGHT_UTF8 is missing in old builds,
        # so the tail is taken with SUBSTR from a start offset clamped at zero.
        subject = ott.value2str(column)
        amount = ott.value2str(count)
        return f'SUBSTR({subject}, MAX(STRLEN({subject})-{amount}, 0))'

    base = self._base_column
    return _StrAccessor.Formatter(
        op_params=[base, count],
        dtype=base.dtype,
        formatter=_render,
    )
|
|
1005
|
+
|
|
1006
|
+
def startswith(self, value):
    """
    Check whether the string starts with ``value``.

    Parameters
    ----------
    value: str or Column or Operation
        String to check if starts with it.

    Returns
    -------
    Operation
        Boolean Operation; it is shown as ``1.0`` / ``0.0`` when added as a column (see the example).

    Examples
    --------
    >>> data = otp.Ticks(X=['baaaa', 'bbbbb', 'cbbc'], Y=['ba', 'abb', 'c'])
    >>> data['STARTSWITH_CONST'] = data['X'].str.startswith('bb')
    >>> data['STARTSWITH_Y'] = data['X'].str.startswith(data['Y'])
    >>> otp.run(data)
                         Time      X    Y  STARTSWITH_CONST  STARTSWITH_Y
    0 2003-12-01 00:00:00.000  baaaa   ba               0.0           1.0
    1 2003-12-01 00:00:00.001  bbbbb  abb               1.0           0.0
    2 2003-12-01 00:00:00.002   cbbc    c               0.0           1.0
    """
    def _render(column, value):
        # Compare the leading STRLEN(value) symbols of the column with ``value`` itself.
        subject = ott.value2str(column)
        prefix = ott.value2str(value)
        return f'LEFT({subject}, STRLEN({prefix}))={prefix}'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, value],
        dtype=bool,
        formatter=_render,
    )
|
|
1034
|
+
|
|
1035
|
+
def endswith(self, value):
    """
    Check whether the string ends with ``value``.

    Parameters
    ----------
    value: str or Column or Operation
        String to check if ends with it.

    Returns
    -------
    Operation
        Boolean Operation; it is shown as ``1.0`` / ``0.0`` when added as a column (see the example).

    Examples
    --------
    >>> data = otp.Ticks(X=['baaaa', 'bbbbb', 'cbbc', 'c'], Y=['ba', 'bbb', 'c', 'cc'])
    >>> data['ENDSWITH_CONST'] = data['X'].str.endswith('bb')
    >>> data['ENDSWITH_Y'] = data['X'].str.endswith(data['Y'])
    >>> otp.run(data)
                         Time      X    Y  ENDSWITH_CONST  ENDSWITH_Y
    0 2003-12-01 00:00:00.000  baaaa   ba             0.0         0.0
    1 2003-12-01 00:00:00.001  bbbbb  bbb             1.0         1.0
    2 2003-12-01 00:00:00.002   cbbc    c             0.0         1.0
    3 2003-12-01 00:00:00.003      c   cc             0.0         0.0
    """
    def _render(column, value):
        # RIGHT misbehaves with a negative index and RIGHT_UTF8 is missing in old builds,
        # so the tail is taken with SUBSTR from a start offset clamped at zero.
        subject = ott.value2str(column)
        suffix = ott.value2str(value)
        return f'SUBSTR({subject}, MAX(STRLEN({subject})-STRLEN({suffix}), 0))={suffix}'

    return _StrAccessor.Formatter(
        op_params=[self._base_column, value],
        dtype=bool,
        formatter=_render,
    )
|
|
1066
|
+
|
|
1067
|
+
def slice(self, start=None, stop=None):
    """
    Return the part of the string between positions ``start`` and ``stop``,
    following the conventions of Python slicing without a step.

    Parameters
    ----------
    start: int or Column or Operation, optional
        Start position for slice operation.
    stop: int or Column or Operation, optional
        Stop position for slice operation.

    Returns
    -------
    Operation
        Sliced string of the same type as the original field.
        When both parameters are set and ``stop`` points to the left of ``start``,
        the result is an empty string.

    Raises
    ------
    ValueError
        If neither ``start`` nor ``stop`` is set.

    Examples
    --------
    >>> data = otp.Ticks(X=['12345', 'abcde', 'qwerty'], START=[3, 0, 1], STOP=[4, 3, 3])
    >>> data['START_1_SLICE'] = data['X'].str.slice(start=1)
    >>> data['STOP_2_SLICE'] = data['X'].str.slice(stop=2)
    >>> data['SLICE_FROM_COLUMNS'] = data['X'].str.slice(start=data['START'], stop=data['STOP'])
    >>> otp.run(data)
                         Time       X  START  STOP START_1_SLICE STOP_2_SLICE SLICE_FROM_COLUMNS
    0 2003-12-01 00:00:00.000   12345      3     4          2345           12                  4
    1 2003-12-01 00:00:00.001   abcde      0     3          bcde           ab                abc
    2 2003-12-01 00:00:00.002  qwerty      1     3         werty           qw                 we

    Parameters can be negative:

    >>> data = otp.Ticks(X=['12345', 'abcde', 'qwerty'])
    >>> data['START_SLICE'] = data['X'].str.slice(start=-3)
    >>> data['STOP_SLICE'] = data['X'].str.slice(stop=-1)
    >>> data['START_STOP_SLICE'] = data['X'].str.slice(start=-3, stop=-1)
    >>> otp.run(data)
                         Time       X START_SLICE STOP_SLICE START_STOP_SLICE
    0 2003-12-01 00:00:00.000   12345         345       1234               34
    1 2003-12-01 00:00:00.001   abcde         cde       abcd               cd
    2 2003-12-01 00:00:00.002  qwerty         rty      qwert               rt

    It is possible to use syntax with indexer to call this method:

    >>> data = otp.Ticks(X=['12345', 'abcde', 'qwerty'])
    >>> data['START_SLICE'] = data['X'].str[1:]
    >>> data['STOP_SLICE'] = data['X'].str[:3]
    >>> data['START_STOP_SLICE'] = data['X'].str[1:3]
    >>> otp.run(data)
                         Time       X START_SLICE STOP_SLICE START_STOP_SLICE
    0 2003-12-01 00:00:00.000   12345        2345        123               23
    1 2003-12-01 00:00:00.001   abcde        bcde        abc               bc
    2 2003-12-01 00:00:00.002  qwerty       werty        qwe               we
    """
    if start is None and stop is None:
        raise ValueError("At least one of the `start` or `stop` parameters should be set.")

    def _cut_right(text, stop_expr):
        # Keep the part of ``text`` located before ``stop_expr``.
        # A non-negative stop is a plain length; a negative one is counted from the end
        # and clamped so that the requested length never goes below zero.
        return (f'CASE({stop_expr}>=0,1,'
                f'SUBSTR({text},0,{stop_expr}),'
                f'SUBSTR({text},0,MAX(0,STRLEN({text})+{stop_expr})))')

    def _cut_left(text, start_expr):
        # Keep the part of ``text`` located from ``start_expr`` on.
        length = f'STRLEN({text})'
        # We need this workaround because simple RIGHT and SUBSTR with a negative start work strange;
        # SUBSTR_UTF8 works fine but it is not supported by old builds.
        corrected = f'LEFT({text},{length})'
        # SUBSTR returns '' when ABS(second parameter) >= STRLEN, hence the clamp to -STRLEN.
        return f'SUBSTR({corrected},MAX({start_expr},-{length}))'

    if start is None:
        def formatter(x, start, stop):
            return _cut_right(ott.value2str(x), ott.value2str(stop))
    elif stop is None:
        def formatter(x, start, stop):
            return _cut_left(ott.value2str(x), ott.value2str(start))
    else:
        def formatter(x, start, stop):
            text = ott.value2str(x)
            stop_expr = ott.value2str(stop)
            # First drop the left part, then cut the right part of what remains.
            remainder = _cut_left(text, ott.value2str(start))
            # Number of symbols already removed on the left.
            dropped = f'(STRLEN({text})-STRLEN({remainder}))'
            # Re-base a non-negative stop onto ``remainder``. It is clamped at zero: without the clamp
            # a stop located to the left of the start turns negative here and would then be
            # misinterpreted as "count from the end" (e.g. '1234567'[3:1] gave '45' instead of '').
            # A negative stop is relative to the end, which did not move, so it is passed through.
            stop_in_remainder = f'CASE({stop_expr}>=0,1,MAX({stop_expr}-{dropped},0),{stop_expr})'
            return _cut_right(remainder, stop_in_remainder)

    return _StrAccessor.Formatter(op_params=[self._base_column, start, stop],
                                  dtype=self._base_column.dtype,
                                  formatter=formatter)
|
|
1153
|
+
|
|
1154
|
+
def __getitem__(self, item):
    """
    Indexer syntax for the accessor.

    A slice object is forwarded to the ``slice`` method (a ``step`` is rejected with ``ValueError``);
    any other ``item`` is forwarded to the ``get`` method.
    """
    # Inside a method body the bare name ``slice`` is not looked up in the class namespace,
    # so the check below is made against the slice type and not against the ``slice`` method.
    if not isinstance(item, slice):
        return self.get(item)
    if item.step is not None:
        raise ValueError("`step` parameter is not supported.")
    return self.slice(start=item.start, stop=item.stop)
|
|
1160
|
+
|
|
1161
|
+
def like(self, pattern):
    r"""
    Check if the value matches the SQL-like ``pattern``.

    Parameters
    ----------
    pattern: str or symbol parameter (:py:class:`~onetick.py.core._source._symbol_param._SymbolParamColumn`)
        Pattern to match the value with.
        The pattern can contain usual text characters and two special ones:

        * ``%`` represents zero or more characters
        * ``_`` represents a single character

        Use backslash ``\`` character to escape these special characters.

    Returns
    -------
    Operation
        ``True`` if the match was successful, ``False`` otherwise.
        Note that boolean Operation is converted to float if added as a column.

    Raises
    ------
    ValueError
        If ``pattern`` is neither a string nor a symbol parameter.

    Examples
    --------

    Use ``%`` character to specify any number of characters:

    >>> data = otp.Ticks(X=['a', 'ab', 'b_', 'b%'])
    >>> data['LIKE'] = data['X'].str.like('a%')
    >>> otp.run(data)
                         Time   X  LIKE
    0 2003-12-01 00:00:00.000   a   1.0
    1 2003-12-01 00:00:00.001  ab   1.0
    2 2003-12-01 00:00:00.002  b_   0.0
    3 2003-12-01 00:00:00.003  b%   0.0

    Use ``_`` special character to specify a single character:

    >>> data = otp.Ticks(X=['a', 'ab', 'b_', 'b%'])
    >>> data['LIKE'] = data['X'].str.like('a_')
    >>> otp.run(data)
                         Time   X  LIKE
    0 2003-12-01 00:00:00.000   a   0.0
    1 2003-12-01 00:00:00.001  ab   1.0
    2 2003-12-01 00:00:00.002  b_   0.0
    3 2003-12-01 00:00:00.003  b%   0.0

    Use backslash ``\`` character to escape special characters:

    >>> data = otp.Ticks(X=['a', 'ab', 'b_', 'b%'])
    >>> data['LIKE'] = data['X'].str.like(r'b\_')
    >>> otp.run(data)
                         Time   X  LIKE
    0 2003-12-01 00:00:00.000   a   0.0
    1 2003-12-01 00:00:00.001  ab   0.0
    2 2003-12-01 00:00:00.002  b_   1.0
    3 2003-12-01 00:00:00.003  b%   0.0

    This function can be used to filter out ticks:

    >>> data = otp.Ticks(X=['a', 'ab', 'b_', 'b%'])
    >>> data = data.where(data['X'].str.like('a%'))
    >>> otp.run(data)
                         Time   X
    0 2003-12-01 00:00:00.000   a
    1 2003-12-01 00:00:00.001  ab

    ``pattern`` can only be a constant expression, like string or symbol parameter:

    >>> data = otp.Ticks(X=['a', 'ab', 'b_', 'b%'])
    >>> data['LIKE'] = data['X'].str.like(data.Symbol['PATTERN', str])
    >>> otp.run(data, symbols=otp.Tick(SYMBOL_NAME='COMMON::AAPL', PATTERN='_'))['COMMON::AAPL']
                         Time   X  LIKE
    0 2003-12-01 00:00:00.000   a   1.0
    1 2003-12-01 00:00:00.001  ab   0.0
    2 2003-12-01 00:00:00.002  b_   0.0
    3 2003-12-01 00:00:00.003  b%   0.0
    """
    from onetick.py.core._source._symbol_param import _SymbolParamColumn

    # Only constant expressions are accepted: plain strings and symbol parameters.
    pattern_is_constant = isinstance(pattern, (str, _SymbolParamColumn))
    if not pattern_is_constant:
        raise ValueError('like() function expects parameter to be a constant expression')

    def _render(column, pattern):
        # Parameter names and order mirror the entries of ``op_params`` below.
        return '{} LIKE {}'.format(ott.value2str(column), ott.value2str(pattern))

    return _StrAccessor.Formatter(
        op_params=[self._base_column, pattern],
        dtype=bool,
        formatter=_render,
    )
|
|
1246
|
+
|
|
1247
|
+
def ilike(self, pattern):
    r"""
    Check if the value matches the SQL-like ``pattern``, ignoring case.

    Parameters
    ----------
    pattern: str or symbol parameter (:py:class:`~onetick.py.core._source._symbol_param._SymbolParamColumn`)
        Pattern to match the value with.
        The pattern can contain usual text characters and two special ones:

        * ``%`` represents zero or more characters
        * ``_`` represents a single character

        Use backslash ``\`` character to escape these special characters.

    Returns
    -------
    Operation
        ``True`` if the match was successful, ``False`` otherwise.
        Note that boolean Operation is converted to float if added as a column.

    Raises
    ------
    ValueError
        If ``pattern`` is neither a string nor a symbol parameter.

    Examples
    --------

    Use ``%`` character to specify any number of characters:

    .. testcode::
       :skipif: not is_ilike_supported()

       data = otp.Ticks(X=['a', 'ab', 'Ab', 'b_'])
       data['LIKE'] = data['X'].str.ilike('a%')
       df = otp.run(data)
       print(df)

    .. testoutput::

                            Time   X  LIKE
       0 2003-12-01 00:00:00.000   a   1.0
       1 2003-12-01 00:00:00.001  ab   1.0
       2 2003-12-01 00:00:00.002  Ab   1.0
       3 2003-12-01 00:00:00.003  b_   0.0

    Use ``_`` special character to specify a single character:

    .. testcode::
       :skipif: not is_ilike_supported()

       data = otp.Ticks(X=['a', 'ab', 'Ab', 'b_'])
       data['LIKE'] = data['X'].str.ilike('a_')
       df = otp.run(data)
       print(df)

    .. testoutput::

                            Time   X  LIKE
       0 2003-12-01 00:00:00.000   a   0.0
       1 2003-12-01 00:00:00.001  ab   1.0
       2 2003-12-01 00:00:00.002  Ab   1.0
       3 2003-12-01 00:00:00.003  b_   0.0

    Use backslash ``\`` character to escape special characters:

    .. testcode::
       :skipif: not is_ilike_supported()

       data = otp.Ticks(X=['a', 'ab', 'bb', 'b_'])
       data['LIKE'] = data['X'].str.ilike(r'b\_')
       df = otp.run(data)
       print(df)

    .. testoutput::

                            Time   X  LIKE
       0 2003-12-01 00:00:00.000   a   0.0
       1 2003-12-01 00:00:00.001  ab   0.0
       2 2003-12-01 00:00:00.002  bb   0.0
       3 2003-12-01 00:00:00.003  b_   1.0

    This function can be used to filter out ticks:

    .. testcode::
       :skipif: not is_ilike_supported()

       data = otp.Ticks(X=['a', 'ab', 'Ab', 'b_'])
       data = data.where(data['X'].str.ilike('a%'))
       df = otp.run(data)
       print(df)

    .. testoutput::

                            Time   X
       0 2003-12-01 00:00:00.000   a
       1 2003-12-01 00:00:00.001  ab
       2 2003-12-01 00:00:00.002  Ab

    ``pattern`` can only be a constant expression, like string or symbol parameter:

    .. testcode::
       :skipif: not is_ilike_supported()

       data = otp.Ticks(X=['a', 'ab', 'A', 'b_'])
       data['LIKE'] = data['X'].str.ilike(data.Symbol['PATTERN', str])
       df = otp.run(data, symbols=otp.Tick(SYMBOL_NAME='COMMON::AAPL', PATTERN='_'))['COMMON::AAPL']
       print(df)

    .. testoutput::

                            Time   X  LIKE
       0 2003-12-01 00:00:00.000   a   1.0
       1 2003-12-01 00:00:00.001  ab   0.0
       2 2003-12-01 00:00:00.002   A   1.0
       3 2003-12-01 00:00:00.003  b_   0.0
    """
    from onetick.py.core._source._symbol_param import _SymbolParamColumn

    # Only constant expressions are accepted: plain strings and symbol parameters.
    pattern_is_constant = isinstance(pattern, (str, _SymbolParamColumn))
    if not pattern_is_constant:
        raise ValueError('ilike() function expects parameter to be a constant expression')

    def _render(column, pattern):
        # Parameter names and order mirror the entries of ``op_params`` below.
        return '{} ILIKE {}'.format(ott.value2str(column), ott.value2str(pattern))

    return _StrAccessor.Formatter(
        op_params=[self._base_column, pattern],
        dtype=bool,
        formatter=_render,
    )
|