equity-aggregator 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- equity_aggregator/README.md +49 -39
- equity_aggregator/adapters/__init__.py +13 -7
- equity_aggregator/adapters/data_sources/__init__.py +4 -6
- equity_aggregator/adapters/data_sources/_utils/_client.py +1 -1
- equity_aggregator/adapters/data_sources/{authoritative_feeds → _utils}/_record_types.py +1 -1
- equity_aggregator/adapters/data_sources/discovery_feeds/__init__.py +17 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/__init__.py +10 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/backoff.py +33 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/parser.py +107 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/intrinio.py +305 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/session.py +197 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/__init__.py +9 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/backoff.py +33 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/parser.py +120 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/lseg.py +239 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/session.py +162 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/sec/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/{authoritative_feeds → discovery_feeds/sec}/sec.py +4 -5
- equity_aggregator/adapters/data_sources/discovery_feeds/stock_analysis/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/stock_analysis/stock_analysis.py +150 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/tradingview/__init__.py +5 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/tradingview/tradingview.py +275 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/xetra/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/{authoritative_feeds → discovery_feeds/xetra}/xetra.py +9 -12
- equity_aggregator/adapters/data_sources/enrichment_feeds/__init__.py +6 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/__init__.py +5 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/api.py +71 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/download.py +109 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/gleif.py +195 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/parser.py +75 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/__init__.py +1 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/_utils/__init__.py +11 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/{utils → _utils}/backoff.py +1 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/{utils → _utils}/fuzzy.py +28 -26
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/_utils/json.py +36 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/__init__.py +1 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/{summary.py → quote_summary.py} +44 -30
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/search.py +10 -5
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/auth.py +130 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/config.py +3 -3
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/ranking.py +97 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/session.py +85 -218
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/transport.py +191 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/yfinance.py +413 -0
- equity_aggregator/adapters/data_sources/reference_lookup/exchange_rate_api.py +6 -13
- equity_aggregator/adapters/data_sources/reference_lookup/openfigi.py +23 -7
- equity_aggregator/cli/dispatcher.py +11 -8
- equity_aggregator/cli/main.py +14 -5
- equity_aggregator/cli/parser.py +1 -1
- equity_aggregator/cli/signals.py +32 -0
- equity_aggregator/domain/_utils/__init__.py +2 -2
- equity_aggregator/domain/_utils/_load_converter.py +30 -21
- equity_aggregator/domain/_utils/_merge.py +221 -368
- equity_aggregator/domain/_utils/_merge_config.py +205 -0
- equity_aggregator/domain/_utils/_strategies.py +180 -0
- equity_aggregator/domain/pipeline/resolve.py +17 -11
- equity_aggregator/domain/pipeline/runner.py +4 -4
- equity_aggregator/domain/pipeline/seed.py +5 -1
- equity_aggregator/domain/pipeline/transforms/__init__.py +2 -2
- equity_aggregator/domain/pipeline/transforms/canonicalise.py +1 -1
- equity_aggregator/domain/pipeline/transforms/enrich.py +328 -285
- equity_aggregator/domain/pipeline/transforms/group.py +48 -0
- equity_aggregator/logging_config.py +4 -1
- equity_aggregator/schemas/__init__.py +11 -5
- equity_aggregator/schemas/canonical.py +11 -6
- equity_aggregator/schemas/feeds/__init__.py +11 -5
- equity_aggregator/schemas/feeds/gleif_feed_data.py +35 -0
- equity_aggregator/schemas/feeds/intrinio_feed_data.py +142 -0
- equity_aggregator/schemas/feeds/{lse_feed_data.py → lseg_feed_data.py} +85 -52
- equity_aggregator/schemas/feeds/sec_feed_data.py +36 -6
- equity_aggregator/schemas/feeds/stock_analysis_feed_data.py +107 -0
- equity_aggregator/schemas/feeds/tradingview_feed_data.py +144 -0
- equity_aggregator/schemas/feeds/xetra_feed_data.py +1 -1
- equity_aggregator/schemas/feeds/yfinance_feed_data.py +47 -35
- equity_aggregator/schemas/raw.py +5 -3
- equity_aggregator/schemas/types.py +7 -0
- equity_aggregator/schemas/validators.py +81 -27
- equity_aggregator/storage/data_store.py +5 -3
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/METADATA +205 -115
- equity_aggregator-0.1.5.dist-info/RECORD +103 -0
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/WHEEL +1 -1
- equity_aggregator/adapters/data_sources/authoritative_feeds/__init__.py +0 -13
- equity_aggregator/adapters/data_sources/authoritative_feeds/euronext.py +0 -420
- equity_aggregator/adapters/data_sources/authoritative_feeds/lse.py +0 -352
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/feed.py +0 -350
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/utils/__init__.py +0 -9
- equity_aggregator/domain/pipeline/transforms/deduplicate.py +0 -54
- equity_aggregator/schemas/feeds/euronext_feed_data.py +0 -59
- equity_aggregator-0.1.1.dist-info/RECORD +0 -72
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/entry_points.txt +0 -0
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/licenses/LICENCE.txt +0 -0
|
@@ -1,459 +1,312 @@
|
|
|
1
1
|
# _utils/_merge.py
|
|
2
2
|
|
|
3
3
|
|
|
4
|
-
from collections import Counter
|
|
5
4
|
from collections.abc import Sequence
|
|
6
5
|
from decimal import Decimal
|
|
7
|
-
from functools import
|
|
8
|
-
from
|
|
9
|
-
from statistics import median
|
|
10
|
-
|
|
11
|
-
from rapidfuzz import fuzz
|
|
6
|
+
from functools import partial
|
|
7
|
+
from typing import NamedTuple
|
|
12
8
|
|
|
13
9
|
from equity_aggregator.schemas.raw import RawEquity
|
|
14
10
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
group (Sequence[RawEquity]): A sequence of RawEquity objects considered
|
|
37
|
-
duplicates to be merged.
|
|
38
|
-
|
|
39
|
-
Returns:
|
|
40
|
-
RawEquity: A new RawEquity instance with merged field values.
|
|
11
|
+
from ._merge_config import (
|
|
12
|
+
FIELD_CONFIG,
|
|
13
|
+
PRICE_RANGE_FIELDS,
|
|
14
|
+
FieldSpec,
|
|
15
|
+
Strategy,
|
|
16
|
+
)
|
|
17
|
+
from ._strategies import (
|
|
18
|
+
filter_by_deviation,
|
|
19
|
+
fuzzy_cluster_mode,
|
|
20
|
+
median_decimal,
|
|
21
|
+
mode_first,
|
|
22
|
+
union_ordered,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _extract_field(
|
|
27
|
+
group: Sequence[RawEquity],
|
|
28
|
+
field: str,
|
|
29
|
+
*,
|
|
30
|
+
filter_none: bool = True,
|
|
31
|
+
) -> list:
|
|
41
32
|
"""
|
|
33
|
+
Extract field values from a group of RawEquity objects.
|
|
42
34
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
return RawEquity(
|
|
47
|
-
name=_merge_name(group),
|
|
48
|
-
symbol=_merge_symbol(group),
|
|
49
|
-
isin=_merge_id(group, "isin"),
|
|
50
|
-
cusip=_merge_id(group, "cusip"),
|
|
51
|
-
cik=_merge_id(group, "cik"),
|
|
52
|
-
share_class_figi=share_class_figi_value,
|
|
53
|
-
mics=_merge_mics(group),
|
|
54
|
-
currency=_merge_currency(group),
|
|
55
|
-
last_price=_merge_decimal_field(group, "last_price"),
|
|
56
|
-
market_cap=_merge_decimal_field(group, "market_cap"),
|
|
57
|
-
fifty_two_week_min=_merge_decimal_field(group, "fifty_two_week_min"),
|
|
58
|
-
fifty_two_week_max=_merge_decimal_field(group, "fifty_two_week_max"),
|
|
59
|
-
dividend_yield=_merge_decimal_field(group, "dividend_yield"),
|
|
60
|
-
market_volume=_merge_decimal_field(group, "market_volume"),
|
|
61
|
-
held_insiders=_merge_decimal_field(group, "held_insiders"),
|
|
62
|
-
held_institutions=_merge_decimal_field(group, "held_institutions"),
|
|
63
|
-
short_interest=_merge_decimal_field(group, "short_interest"),
|
|
64
|
-
share_float=_merge_decimal_field(group, "share_float"),
|
|
65
|
-
shares_outstanding=_merge_decimal_field(group, "shares_outstanding"),
|
|
66
|
-
revenue_per_share=_merge_decimal_field(group, "revenue_per_share"),
|
|
67
|
-
profit_margin=_merge_decimal_field(group, "profit_margin"),
|
|
68
|
-
gross_margin=_merge_decimal_field(group, "gross_margin"),
|
|
69
|
-
operating_margin=_merge_decimal_field(group, "operating_margin"),
|
|
70
|
-
free_cash_flow=_merge_decimal_field(group, "free_cash_flow"),
|
|
71
|
-
operating_cash_flow=_merge_decimal_field(group, "operating_cash_flow"),
|
|
72
|
-
return_on_equity=_merge_decimal_field(group, "return_on_equity"),
|
|
73
|
-
return_on_assets=_merge_decimal_field(group, "return_on_assets"),
|
|
74
|
-
performance_1_year=_merge_decimal_field(group, "performance_1_year"),
|
|
75
|
-
total_debt=_merge_decimal_field(group, "total_debt"),
|
|
76
|
-
revenue=_merge_decimal_field(group, "revenue"),
|
|
77
|
-
ebitda=_merge_decimal_field(group, "ebitda"),
|
|
78
|
-
trailing_pe=_merge_decimal_field(group, "trailing_pe"),
|
|
79
|
-
price_to_book=_merge_decimal_field(group, "price_to_book"),
|
|
80
|
-
trailing_eps=_merge_decimal_field(group, "trailing_eps"),
|
|
81
|
-
analyst_rating=_merge_analyst_rating(group),
|
|
82
|
-
industry=_merge_industry(group),
|
|
83
|
-
sector=_merge_sector(group),
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def _validate_share_class_figi(group: Sequence[RawEquity]) -> str:
|
|
88
|
-
"""
|
|
89
|
-
Validates that all RawEquity objects in the group share the same
|
|
90
|
-
share_class_figi value.
|
|
35
|
+
Retrieves the specified field from each RawEquity object in the group.
|
|
36
|
+
Optionally filters out None values from the result.
|
|
91
37
|
|
|
92
38
|
Args:
|
|
93
|
-
group (Sequence[RawEquity]):
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
ValueError: If the group is empty or contains multiple distinct
|
|
98
|
-
share_class_figi values.
|
|
39
|
+
group (Sequence[RawEquity]): Sequence of RawEquity objects to extract from.
|
|
40
|
+
field (str): Name of the field to extract from each object.
|
|
41
|
+
filter_none (bool, optional): If True, exclude None values from the result.
|
|
42
|
+
Defaults to True.
|
|
99
43
|
|
|
100
44
|
Returns:
|
|
101
|
-
|
|
45
|
+
list: Extracted field values, optionally filtered to exclude None values.
|
|
102
46
|
"""
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
figis = {raw_equity.share_class_figi for raw_equity in group}
|
|
107
|
-
if len(figis) != 1:
|
|
108
|
-
raise ValueError(
|
|
109
|
-
"All raw equities in the group must have identical share_class_figi values "
|
|
110
|
-
f"(found: {sorted(figis)})",
|
|
111
|
-
)
|
|
112
|
-
return figis.pop()
|
|
47
|
+
values = [getattr(eq, field) for eq in group]
|
|
48
|
+
return [v for v in values if v is not None] if filter_none else values
|
|
113
49
|
|
|
114
50
|
|
|
115
|
-
|
|
51
|
+
class EquityIdentifiers(NamedTuple):
|
|
116
52
|
"""
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
Defaults to 90.
|
|
128
|
-
|
|
129
|
-
Returns:
|
|
130
|
-
str: The selected representative equity name from the group.
|
|
53
|
+
Representative identifiers extracted from a group of RawEquity records.
|
|
54
|
+
|
|
55
|
+
Attributes:
|
|
56
|
+
symbol: Representative ticker symbol.
|
|
57
|
+
name: Representative equity name.
|
|
58
|
+
isin: Representative ISIN identifier.
|
|
59
|
+
cusip: Representative CUSIP identifier.
|
|
60
|
+
cik: Representative CIK identifier.
|
|
61
|
+
lei: Representative LEI identifier.
|
|
62
|
+
share_class_figi: Validated share class FIGI (must be identical across group).
|
|
131
63
|
"""
|
|
132
|
-
names = [equity.name for equity in duplicate_group]
|
|
133
|
-
|
|
134
|
-
# cluster names by fuzzy similarity
|
|
135
|
-
clusters = _cluster(names, threshold=threshold)
|
|
136
64
|
|
|
137
|
-
|
|
138
|
-
|
|
65
|
+
symbol: str
|
|
66
|
+
name: str
|
|
67
|
+
isin: str | None
|
|
68
|
+
cusip: str | None
|
|
69
|
+
cik: str | None
|
|
70
|
+
lei: str | None
|
|
71
|
+
share_class_figi: str
|
|
139
72
|
|
|
140
|
-
def _cluster_weight(cluster: list[str]) -> int:
|
|
141
|
-
return sum(weight[token] for token in cluster)
|
|
142
73
|
|
|
143
|
-
|
|
144
|
-
best_cluster = max(clusters, key=_cluster_weight)
|
|
145
|
-
|
|
146
|
-
return next(name for name in names if name in best_cluster)
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def _merge_symbol(duplicate_group: Sequence[RawEquity]) -> str:
|
|
74
|
+
def merge(group: Sequence[RawEquity]) -> RawEquity:
|
|
150
75
|
"""
|
|
151
|
-
|
|
76
|
+
Merge a group of RawEquity records into a single, representative RawEquity instance.
|
|
152
77
|
|
|
153
|
-
|
|
154
|
-
|
|
78
|
+
Each field is merged using a configurable strategy defined in FIELD_CONFIG:
|
|
79
|
+
- Most fields use one of: mode (most frequent), median (for numerics), fuzzy
|
|
80
|
+
clustering (for similar strings), or union (for lists).
|
|
81
|
+
- Price range fields (last_price, fifty_two_week_min, fifty_two_week_max) are
|
|
82
|
+
merged together with additional consistency checks.
|
|
83
|
+
|
|
84
|
+
The merging process ensures that all records in the group share the same
|
|
85
|
+
share_class_figi; otherwise, a ValueError is raised.
|
|
155
86
|
|
|
156
87
|
Args:
|
|
157
|
-
|
|
158
|
-
|
|
88
|
+
group (Sequence[RawEquity]): Non-empty sequence of RawEquity objects to merge.
|
|
89
|
+
All must have identical share_class_figi.
|
|
159
90
|
|
|
160
91
|
Returns:
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
"""
|
|
164
|
-
symbols = [equity.symbol for equity in duplicate_group]
|
|
165
|
-
return Counter(symbols).most_common(1)[0][0]
|
|
92
|
+
RawEquity: A new RawEquity instance with merged values for each field, according
|
|
93
|
+
to the configured strategies.
|
|
166
94
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
field: str,
|
|
171
|
-
) -> Decimal | None:
|
|
95
|
+
Raises:
|
|
96
|
+
ValueError: If the group is empty or contains multiple distinct share_class_figi
|
|
97
|
+
values.
|
|
172
98
|
"""
|
|
173
|
-
|
|
174
|
-
RawEquity objects, ignoring any entries where the field value is None.
|
|
99
|
+
share_class_figi = _validate_share_class_figi(group)
|
|
175
100
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
101
|
+
merged = {
|
|
102
|
+
"share_class_figi": share_class_figi,
|
|
103
|
+
**{
|
|
104
|
+
field: _apply_strategy(group, field, spec)
|
|
105
|
+
for field, spec in FIELD_CONFIG.items()
|
|
106
|
+
if field not in PRICE_RANGE_FIELDS
|
|
107
|
+
},
|
|
108
|
+
**_merge_price_range(group),
|
|
109
|
+
}
|
|
180
110
|
|
|
181
|
-
|
|
182
|
-
Decimal | None: The median of the non-null field values as a Decimal,
|
|
183
|
-
or None if no valid values are present.
|
|
184
|
-
"""
|
|
185
|
-
values: list[Decimal] = [
|
|
186
|
-
getattr(equity, field)
|
|
187
|
-
for equity in duplicate_group
|
|
188
|
-
if getattr(equity, field) is not None
|
|
189
|
-
]
|
|
190
|
-
return median(values) if values else None
|
|
111
|
+
return RawEquity.model_validate(merged)
|
|
191
112
|
|
|
192
113
|
|
|
193
|
-
def
|
|
114
|
+
def extract_identifiers(group: Sequence[RawEquity]) -> EquityIdentifiers:
|
|
194
115
|
"""
|
|
195
|
-
|
|
196
|
-
("isin", "cusip", "cik" or "share_class_figi") from a group of RawEquity objects.
|
|
116
|
+
Compute representative identifiers from a group of RawEquity records.
|
|
197
117
|
|
|
198
|
-
|
|
199
|
-
|
|
118
|
+
Uses the same resolution algorithms as merge() — mode for IDs,
|
|
119
|
+
fuzzy clustering for name, frequency for symbol.
|
|
200
120
|
|
|
201
121
|
Args:
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
or "share_class_figi").
|
|
122
|
+
group: A non-empty sequence of RawEquity objects from which to extract
|
|
123
|
+
identifiers. All records must share the same share_class_figi.
|
|
205
124
|
|
|
206
125
|
Returns:
|
|
207
|
-
|
|
208
|
-
"""
|
|
209
|
-
# get the values for the given field
|
|
210
|
-
values = [
|
|
211
|
-
getattr(equity, field)
|
|
212
|
-
for equity in duplicate_group
|
|
213
|
-
if getattr(equity, field) is not None
|
|
214
|
-
]
|
|
215
|
-
|
|
216
|
-
if not values:
|
|
217
|
-
return None
|
|
218
|
-
|
|
219
|
-
counts = Counter(values)
|
|
220
|
-
|
|
221
|
-
# max(counts.values()) guaranteed ≥ 1
|
|
222
|
-
best_freq = max(counts.values())
|
|
126
|
+
EquityIdentifiers: Representative identifiers resolved from the group.
|
|
223
127
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
def _merge_mics(duplicate_group: Sequence[RawEquity]) -> list[str] | None:
|
|
229
|
-
"""
|
|
230
|
-
Merges all non-null MIC lists from a group of RawEquity objects, preserving the
|
|
231
|
-
order of first occurrence and removing duplicates.
|
|
232
|
-
|
|
233
|
-
Args:
|
|
234
|
-
duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects, each
|
|
235
|
-
possibly containing a list of MICs.
|
|
236
|
-
|
|
237
|
-
Returns:
|
|
238
|
-
list[str] | None: A list of unique MICs in order of first appearance, or None
|
|
239
|
-
if no MICs are found.
|
|
128
|
+
Raises:
|
|
129
|
+
ValueError: If the group is empty or contains multiple distinct
|
|
130
|
+
share_class_figi values.
|
|
240
131
|
"""
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
132
|
+
share_class_figi = _validate_share_class_figi(group)
|
|
133
|
+
|
|
134
|
+
return EquityIdentifiers(
|
|
135
|
+
symbol=mode_first(_extract_field(group, "symbol")),
|
|
136
|
+
name=fuzzy_cluster_mode(_extract_field(group, "name")),
|
|
137
|
+
isin=mode_first(_extract_field(group, "isin")),
|
|
138
|
+
cusip=mode_first(_extract_field(group, "cusip")),
|
|
139
|
+
cik=mode_first(_extract_field(group, "cik")),
|
|
140
|
+
lei=mode_first(_extract_field(group, "lei")),
|
|
141
|
+
share_class_figi=share_class_figi,
|
|
142
|
+
)
|
|
251
143
|
|
|
252
144
|
|
|
253
|
-
def
|
|
145
|
+
def _validate_share_class_figi(group: Sequence[RawEquity]) -> str:
|
|
254
146
|
"""
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
currency code in the original group order. If all currency codes are null,
|
|
258
|
-
returns None.
|
|
147
|
+
Validates that all RawEquity objects in the group share the same
|
|
148
|
+
share_class_figi value.
|
|
259
149
|
|
|
260
150
|
Args:
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
Returns:
|
|
265
|
-
str | None: The most frequent non-null currency code, or None if all are null.
|
|
266
|
-
"""
|
|
267
|
-
# get the currency codes from the duplicate group
|
|
268
|
-
currency_codes = [
|
|
269
|
-
equity.currency for equity in duplicate_group if equity.currency is not None
|
|
270
|
-
]
|
|
271
|
-
|
|
272
|
-
if not currency_codes:
|
|
273
|
-
return None
|
|
274
|
-
|
|
275
|
-
freq = Counter(currency_codes)
|
|
276
|
-
best_freq = max(freq.values())
|
|
277
|
-
|
|
278
|
-
# earliest among the non-null currencies with best frequency
|
|
279
|
-
return next(currency for currency in currency_codes if freq[currency] == best_freq)
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
def _merge_analyst_rating(duplicate_group: Sequence[RawEquity]) -> str | None:
|
|
283
|
-
"""
|
|
284
|
-
Selects the most frequent non-null analyst rating ("BUY", "SELL", or "HOLD")
|
|
285
|
-
from a group of RawEquity objects. If there is a tie, the rating that appears
|
|
286
|
-
first in the input sequence is returned. Returns None if all ratings are missing.
|
|
151
|
+
group (Sequence[RawEquity]): A non-empty sequence of RawEquity objects to
|
|
152
|
+
validate.
|
|
287
153
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
154
|
+
Raises:
|
|
155
|
+
ValueError: If the group is empty or contains multiple distinct
|
|
156
|
+
share_class_figi values.
|
|
291
157
|
|
|
292
158
|
Returns:
|
|
293
|
-
str
|
|
294
|
-
missing.
|
|
159
|
+
str: The single shared share_class_figi value present in the group.
|
|
295
160
|
"""
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
for equity in duplicate_group
|
|
299
|
-
if equity.analyst_rating is not None
|
|
300
|
-
]
|
|
301
|
-
|
|
302
|
-
if not ratings:
|
|
303
|
-
return None
|
|
304
|
-
|
|
305
|
-
freq = Counter(ratings)
|
|
306
|
-
best = max(freq.values())
|
|
161
|
+
if not group:
|
|
162
|
+
raise ValueError("Cannot merge an empty group of equities")
|
|
307
163
|
|
|
308
|
-
|
|
309
|
-
|
|
164
|
+
figis = {raw_equity.share_class_figi for raw_equity in group}
|
|
165
|
+
if len(figis) != 1:
|
|
166
|
+
raise ValueError(
|
|
167
|
+
"All raw equities in the group must have identical share_class_figi values "
|
|
168
|
+
f"(found: {sorted(figis)})",
|
|
169
|
+
)
|
|
170
|
+
return figis.pop()
|
|
310
171
|
|
|
311
172
|
|
|
312
|
-
def
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
) ->
|
|
173
|
+
def _apply_strategy(
|
|
174
|
+
group: Sequence[RawEquity],
|
|
175
|
+
field: str,
|
|
176
|
+
spec: FieldSpec,
|
|
177
|
+
) -> object:
|
|
317
178
|
"""
|
|
318
|
-
|
|
179
|
+
Apply a specific merge strategy to a field.
|
|
319
180
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
- Within the winning cluster, the earliest spelling in the original sequence is
|
|
324
|
-
returned, preserving original capitalisation.
|
|
181
|
+
Extracts field values from the group and applies the configured strategy.
|
|
182
|
+
If fewer than min_sources non-None values exist, returns None to prevent
|
|
183
|
+
accepting dubious single-source data.
|
|
325
184
|
|
|
326
185
|
Args:
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
industry names. Defaults to 90.
|
|
186
|
+
group (Sequence[RawEquity]): Sequence of RawEquity objects to merge.
|
|
187
|
+
field (str): Name of the field to merge.
|
|
188
|
+
spec (FieldSpec): Strategy specification for this field.
|
|
331
189
|
|
|
332
190
|
Returns:
|
|
333
|
-
|
|
334
|
-
are missing or blank.
|
|
191
|
+
object: The merged value for this field, or None if quorum not met.
|
|
335
192
|
"""
|
|
336
|
-
|
|
337
|
-
industries = [
|
|
338
|
-
equity.industry for equity in duplicate_group if equity.industry is not None
|
|
339
|
-
]
|
|
340
|
-
|
|
341
|
-
if not industries:
|
|
342
|
-
return None
|
|
343
|
-
|
|
344
|
-
# cluster names by fuzzy similarity
|
|
345
|
-
clusters = _cluster(industries, threshold=threshold)
|
|
193
|
+
values = _extract_field(group, field, filter_none=(spec.strategy != Strategy.UNION))
|
|
346
194
|
|
|
347
|
-
|
|
348
|
-
|
|
195
|
+
if spec.max_deviation is not None and spec.strategy == Strategy.MEDIAN:
|
|
196
|
+
values = filter_by_deviation(values, spec.max_deviation)
|
|
349
197
|
|
|
350
|
-
|
|
351
|
-
return
|
|
198
|
+
if len(values) < spec.min_sources:
|
|
199
|
+
return None
|
|
352
200
|
|
|
353
|
-
|
|
354
|
-
|
|
201
|
+
dispatch = {
|
|
202
|
+
Strategy.MODE: mode_first,
|
|
203
|
+
Strategy.FUZZY_CLUSTER: partial(fuzzy_cluster_mode, threshold=spec.threshold),
|
|
204
|
+
Strategy.UNION: union_ordered,
|
|
205
|
+
Strategy.MEDIAN: median_decimal,
|
|
206
|
+
}
|
|
355
207
|
|
|
356
|
-
|
|
357
|
-
return next(industry for industry in industries if industry in best_cluster)
|
|
208
|
+
return dispatch[spec.strategy](values)
|
|
358
209
|
|
|
359
210
|
|
|
360
|
-
def
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
) -> str | None:
|
|
211
|
+
def _merge_price_range(
|
|
212
|
+
group: Sequence[RawEquity],
|
|
213
|
+
min_consistent: int = 2,
|
|
214
|
+
) -> dict[str, Decimal | None]:
|
|
365
215
|
"""
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
216
|
+
Merge last_price, fifty_two_week_min, and fifty_two_week_max with tiered quality
|
|
217
|
+
checks.
|
|
218
|
+
|
|
219
|
+
Attempts to merge price fields as a coherent triplet when possible, falling back to
|
|
220
|
+
independent field merging when complete records are unavailable. This preserves data
|
|
221
|
+
quality through consistency checks whilst avoiding unnecessary data loss.
|
|
222
|
+
|
|
223
|
+
Primary strategy (preferred):
|
|
224
|
+
- Requires records with all three price fields populated (complete records).
|
|
225
|
+
- Filters out records where last_price violates the 52-week range constraint.
|
|
226
|
+
- A 10% tolerance above fifty_two_week_max accommodates timing drift between
|
|
227
|
+
feeds.
|
|
228
|
+
- If quorum of consistent complete records is met (default: 2), returns median
|
|
229
|
+
values.
|
|
230
|
+
|
|
231
|
+
Fallback strategy (when quorum not met):
|
|
232
|
+
- Merges each price field independently using per-field configuration from
|
|
233
|
+
FIELD_CONFIG.
|
|
234
|
+
- Each field still requires its own min_sources threshold (typically 2).
|
|
235
|
+
- Allows partial price data when complete triplets are unavailable across sources.
|
|
372
236
|
|
|
373
237
|
Args:
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
names. Defaults to 90.
|
|
238
|
+
group (Sequence[RawEquity]): Sequence of RawEquity objects to merge.
|
|
239
|
+
min_consistent (int): Minimum number of consistent complete records required
|
|
240
|
+
for primary strategy. Defaults to 2.
|
|
378
241
|
|
|
379
242
|
Returns:
|
|
380
|
-
str | None:
|
|
381
|
-
|
|
243
|
+
dict[str, Decimal | None]: Dictionary containing merged last_price,
|
|
244
|
+
fifty_two_week_min, and fifty_two_week_max values. Fields may be None
|
|
245
|
+
if neither strategy can satisfy quorum requirements.
|
|
382
246
|
"""
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
if not sectors:
|
|
387
|
-
return None
|
|
388
|
-
|
|
389
|
-
# cluster names by fuzzy similarity
|
|
390
|
-
clusters = _cluster(sectors, threshold=threshold)
|
|
391
|
-
|
|
392
|
-
# weight clusters and keep earliest spelling
|
|
393
|
-
weights = Counter(sectors)
|
|
394
|
-
|
|
395
|
-
def _cluster_weight(cluster: list[str]) -> int:
|
|
396
|
-
return sum(weights[token] for token in cluster)
|
|
247
|
+
consistent = tuple(
|
|
248
|
+
filter(_is_price_consistent, filter(_is_price_complete, group)),
|
|
249
|
+
)
|
|
397
250
|
|
|
398
|
-
|
|
399
|
-
|
|
251
|
+
if len(consistent) >= min_consistent:
|
|
252
|
+
return {
|
|
253
|
+
"last_price": median_decimal([eq.last_price for eq in consistent]),
|
|
254
|
+
"fifty_two_week_min": median_decimal(
|
|
255
|
+
[eq.fifty_two_week_min for eq in consistent],
|
|
256
|
+
),
|
|
257
|
+
"fifty_two_week_max": median_decimal(
|
|
258
|
+
[eq.fifty_two_week_max for eq in consistent],
|
|
259
|
+
),
|
|
260
|
+
}
|
|
400
261
|
|
|
401
|
-
#
|
|
402
|
-
return
|
|
262
|
+
# Fallback: merge fields independently
|
|
263
|
+
return {
|
|
264
|
+
field: _apply_strategy(group, field, FIELD_CONFIG[field])
|
|
265
|
+
for field in PRICE_RANGE_FIELDS
|
|
266
|
+
}
|
|
403
267
|
|
|
404
268
|
|
|
405
|
-
|
|
406
|
-
def _token_ratio(a: str, b: str) -> int:
|
|
269
|
+
def _is_price_complete(eq: RawEquity) -> bool:
|
|
407
270
|
"""
|
|
408
|
-
|
|
271
|
+
Checks if a RawEquity record has non-null values for last_price, fifty_two_week_min,
|
|
272
|
+
and fifty_two_week_max.
|
|
409
273
|
|
|
410
274
|
Args:
|
|
411
|
-
|
|
412
|
-
b (str): The second string to compare.
|
|
275
|
+
eq (RawEquity): The RawEquity instance to check.
|
|
413
276
|
|
|
414
277
|
Returns:
|
|
415
|
-
|
|
278
|
+
bool: True if all three price fields are not None, False otherwise.
|
|
416
279
|
"""
|
|
417
|
-
return
|
|
280
|
+
return (
|
|
281
|
+
eq.last_price is not None
|
|
282
|
+
and eq.fifty_two_week_min is not None
|
|
283
|
+
and eq.fifty_two_week_max is not None
|
|
284
|
+
)
|
|
418
285
|
|
|
419
286
|
|
|
420
|
-
def
|
|
287
|
+
def _is_price_consistent(eq: RawEquity) -> bool:
|
|
421
288
|
"""
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
Each name is compared to the representative (first item) of each existing cluster.
|
|
426
|
-
If the token-set ratio between the name and a cluster's representative is greater
|
|
427
|
-
than or equal to the specified threshold, the name is added to that cluster.
|
|
428
|
-
|
|
429
|
-
Otherwise, a new cluster is created for the name.
|
|
289
|
+
Checks if the last_price of a RawEquity record falls within its fifty_two_week_min
|
|
290
|
+
and fifty_two_week_max range, allowing a 10% tolerance above the max.
|
|
430
291
|
|
|
431
292
|
Args:
|
|
432
|
-
|
|
433
|
-
threshold (int, optional): Minimum token-set ratio (0-100) required to join an
|
|
434
|
-
existing cluster. Defaults to 90.
|
|
293
|
+
eq (RawEquity): The RawEquity instance to check.
|
|
435
294
|
|
|
436
295
|
Returns:
|
|
437
|
-
|
|
438
|
-
|
|
296
|
+
bool: True if last_price is between fifty_two_week_min and up to 10% above
|
|
297
|
+
fifty_two_week_max, False otherwise. Returns False if any price field
|
|
298
|
+
is None.
|
|
439
299
|
"""
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
if target:
|
|
455
|
-
target.append(name)
|
|
456
|
-
else:
|
|
457
|
-
clusters.append([name])
|
|
458
|
-
|
|
459
|
-
return clusters
|
|
300
|
+
if (
|
|
301
|
+
eq.last_price is None
|
|
302
|
+
or eq.fifty_two_week_min is None
|
|
303
|
+
or eq.fifty_two_week_max is None
|
|
304
|
+
):
|
|
305
|
+
return False
|
|
306
|
+
|
|
307
|
+
price_tolerance = Decimal("1.1")
|
|
308
|
+
return (
|
|
309
|
+
eq.fifty_two_week_min
|
|
310
|
+
<= eq.last_price
|
|
311
|
+
<= eq.fifty_two_week_max * price_tolerance
|
|
312
|
+
)
|