equity-aggregator 0.1.1__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. equity_aggregator/README.md +49 -39
  2. equity_aggregator/adapters/__init__.py +13 -7
  3. equity_aggregator/adapters/data_sources/__init__.py +4 -6
  4. equity_aggregator/adapters/data_sources/_utils/_client.py +1 -1
  5. equity_aggregator/adapters/data_sources/{authoritative_feeds → _utils}/_record_types.py +1 -1
  6. equity_aggregator/adapters/data_sources/discovery_feeds/__init__.py +17 -0
  7. equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/__init__.py +7 -0
  8. equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/__init__.py +10 -0
  9. equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/backoff.py +33 -0
  10. equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/parser.py +107 -0
  11. equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/intrinio.py +305 -0
  12. equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/session.py +197 -0
  13. equity_aggregator/adapters/data_sources/discovery_feeds/lseg/__init__.py +7 -0
  14. equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/__init__.py +9 -0
  15. equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/backoff.py +33 -0
  16. equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/parser.py +120 -0
  17. equity_aggregator/adapters/data_sources/discovery_feeds/lseg/lseg.py +239 -0
  18. equity_aggregator/adapters/data_sources/discovery_feeds/lseg/session.py +162 -0
  19. equity_aggregator/adapters/data_sources/discovery_feeds/sec/__init__.py +7 -0
  20. equity_aggregator/adapters/data_sources/{authoritative_feeds → discovery_feeds/sec}/sec.py +4 -5
  21. equity_aggregator/adapters/data_sources/discovery_feeds/stock_analysis/__init__.py +7 -0
  22. equity_aggregator/adapters/data_sources/discovery_feeds/stock_analysis/stock_analysis.py +150 -0
  23. equity_aggregator/adapters/data_sources/discovery_feeds/tradingview/__init__.py +5 -0
  24. equity_aggregator/adapters/data_sources/discovery_feeds/tradingview/tradingview.py +275 -0
  25. equity_aggregator/adapters/data_sources/discovery_feeds/xetra/__init__.py +7 -0
  26. equity_aggregator/adapters/data_sources/{authoritative_feeds → discovery_feeds/xetra}/xetra.py +9 -12
  27. equity_aggregator/adapters/data_sources/enrichment_feeds/__init__.py +6 -1
  28. equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/__init__.py +5 -0
  29. equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/api.py +71 -0
  30. equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/download.py +109 -0
  31. equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/gleif.py +195 -0
  32. equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/parser.py +75 -0
  33. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/__init__.py +1 -1
  34. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/_utils/__init__.py +11 -0
  35. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/{utils → _utils}/backoff.py +1 -1
  36. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/{utils → _utils}/fuzzy.py +28 -26
  37. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/_utils/json.py +36 -0
  38. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/__init__.py +1 -1
  39. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/{summary.py → quote_summary.py} +44 -30
  40. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/search.py +10 -5
  41. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/auth.py +130 -0
  42. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/config.py +3 -3
  43. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/ranking.py +97 -0
  44. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/session.py +85 -218
  45. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/transport.py +191 -0
  46. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/yfinance.py +413 -0
  47. equity_aggregator/adapters/data_sources/reference_lookup/exchange_rate_api.py +6 -13
  48. equity_aggregator/adapters/data_sources/reference_lookup/openfigi.py +23 -7
  49. equity_aggregator/cli/dispatcher.py +11 -8
  50. equity_aggregator/cli/main.py +14 -5
  51. equity_aggregator/cli/parser.py +1 -1
  52. equity_aggregator/cli/signals.py +32 -0
  53. equity_aggregator/domain/_utils/__init__.py +2 -2
  54. equity_aggregator/domain/_utils/_load_converter.py +30 -21
  55. equity_aggregator/domain/_utils/_merge.py +221 -368
  56. equity_aggregator/domain/_utils/_merge_config.py +205 -0
  57. equity_aggregator/domain/_utils/_strategies.py +180 -0
  58. equity_aggregator/domain/pipeline/resolve.py +17 -11
  59. equity_aggregator/domain/pipeline/runner.py +4 -4
  60. equity_aggregator/domain/pipeline/seed.py +5 -1
  61. equity_aggregator/domain/pipeline/transforms/__init__.py +2 -2
  62. equity_aggregator/domain/pipeline/transforms/canonicalise.py +1 -1
  63. equity_aggregator/domain/pipeline/transforms/enrich.py +328 -285
  64. equity_aggregator/domain/pipeline/transforms/group.py +48 -0
  65. equity_aggregator/logging_config.py +4 -1
  66. equity_aggregator/schemas/__init__.py +11 -5
  67. equity_aggregator/schemas/canonical.py +11 -6
  68. equity_aggregator/schemas/feeds/__init__.py +11 -5
  69. equity_aggregator/schemas/feeds/gleif_feed_data.py +35 -0
  70. equity_aggregator/schemas/feeds/intrinio_feed_data.py +142 -0
  71. equity_aggregator/schemas/feeds/{lse_feed_data.py → lseg_feed_data.py} +85 -52
  72. equity_aggregator/schemas/feeds/sec_feed_data.py +36 -6
  73. equity_aggregator/schemas/feeds/stock_analysis_feed_data.py +107 -0
  74. equity_aggregator/schemas/feeds/tradingview_feed_data.py +144 -0
  75. equity_aggregator/schemas/feeds/xetra_feed_data.py +1 -1
  76. equity_aggregator/schemas/feeds/yfinance_feed_data.py +47 -35
  77. equity_aggregator/schemas/raw.py +5 -3
  78. equity_aggregator/schemas/types.py +7 -0
  79. equity_aggregator/schemas/validators.py +81 -27
  80. equity_aggregator/storage/data_store.py +5 -3
  81. {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/METADATA +205 -115
  82. equity_aggregator-0.1.5.dist-info/RECORD +103 -0
  83. {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/WHEEL +1 -1
  84. equity_aggregator/adapters/data_sources/authoritative_feeds/__init__.py +0 -13
  85. equity_aggregator/adapters/data_sources/authoritative_feeds/euronext.py +0 -420
  86. equity_aggregator/adapters/data_sources/authoritative_feeds/lse.py +0 -352
  87. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/feed.py +0 -350
  88. equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/utils/__init__.py +0 -9
  89. equity_aggregator/domain/pipeline/transforms/deduplicate.py +0 -54
  90. equity_aggregator/schemas/feeds/euronext_feed_data.py +0 -59
  91. equity_aggregator-0.1.1.dist-info/RECORD +0 -72
  92. {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/entry_points.txt +0 -0
  93. {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.5.dist-info}/licenses/LICENCE.txt +0 -0
@@ -1,459 +1,312 @@
1
1
  # _utils/_merge.py
2
2
 
3
3
 
4
- from collections import Counter
5
4
  from collections.abc import Sequence
6
5
  from decimal import Decimal
7
- from functools import cache
8
- from itertools import chain
9
- from statistics import median
10
-
11
- from rapidfuzz import fuzz
6
+ from functools import partial
7
+ from typing import NamedTuple
12
8
 
13
9
  from equity_aggregator.schemas.raw import RawEquity
14
10
 
15
-
16
- def merge(group: Sequence[RawEquity]) -> RawEquity:
17
- """
18
- Merges a group of duplicate RawEquity records into a single representative record.
19
-
20
- For each field, a representative value is computed as follows:
21
- - name: Clustered by fuzzy similarity; selecting most frequent, earliest spelling.
22
- - symbol: Most frequent symbol; ties broken by first occurrence.
23
- - isin, cusip, cik, share_class_figi: Most frequent value; ties broken by order.
24
- - mics: Union of all non-null lists, order-preserving and duplicates removed.
25
- - currency: Most frequent non-null value; ties broken by first occurrence.
26
- - last_price: Median of all non-null values.
27
- - market_cap: Median of all non-null values.
28
-
29
- Note:
30
- This function requires that the input group is non-empty and that all RawEquity
31
- objects in the group share the same identical share_class_figi. If these
32
- conditions are not met, a ValueError will be raised. This is an enforced
33
- constraint to prevent merging heterogeneous equity records.
34
-
35
- Args:
36
- group (Sequence[RawEquity]): A sequence of RawEquity objects considered
37
- duplicates to be merged.
38
-
39
- Returns:
40
- RawEquity: A new RawEquity instance with merged field values.
11
+ from ._merge_config import (
12
+ FIELD_CONFIG,
13
+ PRICE_RANGE_FIELDS,
14
+ FieldSpec,
15
+ Strategy,
16
+ )
17
+ from ._strategies import (
18
+ filter_by_deviation,
19
+ fuzzy_cluster_mode,
20
+ median_decimal,
21
+ mode_first,
22
+ union_ordered,
23
+ )
24
+
25
+
26
+ def _extract_field(
27
+ group: Sequence[RawEquity],
28
+ field: str,
29
+ *,
30
+ filter_none: bool = True,
31
+ ) -> list:
41
32
  """
33
+ Extract field values from a group of RawEquity objects.
42
34
 
43
- # validate share_class_figi consistency first
44
- share_class_figi_value = _validate_share_class_figi(group)
45
-
46
- return RawEquity(
47
- name=_merge_name(group),
48
- symbol=_merge_symbol(group),
49
- isin=_merge_id(group, "isin"),
50
- cusip=_merge_id(group, "cusip"),
51
- cik=_merge_id(group, "cik"),
52
- share_class_figi=share_class_figi_value,
53
- mics=_merge_mics(group),
54
- currency=_merge_currency(group),
55
- last_price=_merge_decimal_field(group, "last_price"),
56
- market_cap=_merge_decimal_field(group, "market_cap"),
57
- fifty_two_week_min=_merge_decimal_field(group, "fifty_two_week_min"),
58
- fifty_two_week_max=_merge_decimal_field(group, "fifty_two_week_max"),
59
- dividend_yield=_merge_decimal_field(group, "dividend_yield"),
60
- market_volume=_merge_decimal_field(group, "market_volume"),
61
- held_insiders=_merge_decimal_field(group, "held_insiders"),
62
- held_institutions=_merge_decimal_field(group, "held_institutions"),
63
- short_interest=_merge_decimal_field(group, "short_interest"),
64
- share_float=_merge_decimal_field(group, "share_float"),
65
- shares_outstanding=_merge_decimal_field(group, "shares_outstanding"),
66
- revenue_per_share=_merge_decimal_field(group, "revenue_per_share"),
67
- profit_margin=_merge_decimal_field(group, "profit_margin"),
68
- gross_margin=_merge_decimal_field(group, "gross_margin"),
69
- operating_margin=_merge_decimal_field(group, "operating_margin"),
70
- free_cash_flow=_merge_decimal_field(group, "free_cash_flow"),
71
- operating_cash_flow=_merge_decimal_field(group, "operating_cash_flow"),
72
- return_on_equity=_merge_decimal_field(group, "return_on_equity"),
73
- return_on_assets=_merge_decimal_field(group, "return_on_assets"),
74
- performance_1_year=_merge_decimal_field(group, "performance_1_year"),
75
- total_debt=_merge_decimal_field(group, "total_debt"),
76
- revenue=_merge_decimal_field(group, "revenue"),
77
- ebitda=_merge_decimal_field(group, "ebitda"),
78
- trailing_pe=_merge_decimal_field(group, "trailing_pe"),
79
- price_to_book=_merge_decimal_field(group, "price_to_book"),
80
- trailing_eps=_merge_decimal_field(group, "trailing_eps"),
81
- analyst_rating=_merge_analyst_rating(group),
82
- industry=_merge_industry(group),
83
- sector=_merge_sector(group),
84
- )
85
-
86
-
87
- def _validate_share_class_figi(group: Sequence[RawEquity]) -> str:
88
- """
89
- Validates that all RawEquity objects in the group share the same
90
- share_class_figi value.
35
+ Retrieves the specified field from each RawEquity object in the group.
36
+ Optionally filters out None values from the result.
91
37
 
92
38
  Args:
93
- group (Sequence[RawEquity]): A non-empty sequence of RawEquity objects to
94
- validate.
95
-
96
- Raises:
97
- ValueError: If the group is empty or contains multiple distinct
98
- share_class_figi values.
39
+ group (Sequence[RawEquity]): Sequence of RawEquity objects to extract from.
40
+ field (str): Name of the field to extract from each object.
41
+ filter_none (bool, optional): If True, exclude None values from the result.
42
+ Defaults to True.
99
43
 
100
44
  Returns:
101
- str: The single shared share_class_figi value present in the group.
45
+ list: Extracted field values, optionally filtered to exclude None values.
102
46
  """
103
- if not group:
104
- raise ValueError("Cannot merge an empty group of equities")
105
-
106
- figis = {raw_equity.share_class_figi for raw_equity in group}
107
- if len(figis) != 1:
108
- raise ValueError(
109
- "All raw equities in the group must have identical share_class_figi values "
110
- f"(found: {sorted(figis)})",
111
- )
112
- return figis.pop()
47
+ values = [getattr(eq, field) for eq in group]
48
+ return [v for v in values if v is not None] if filter_none else values
113
49
 
114
50
 
115
- def _merge_name(duplicate_group: Sequence[RawEquity], *, threshold: int = 90) -> str:
51
+ class EquityIdentifiers(NamedTuple):
116
52
  """
117
- Selects a representative equity name from a group of near-duplicate equities.
118
-
119
- This function clusters similar equity names using fuzzy matching, then selects the
120
- cluster with the highest total occurrence count. Within the chosen cluster, it
121
- returns the earliest original spelling found in the input sequence.
122
-
123
- Args:
124
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects
125
- considered near-duplicates, each with a 'name' attribute.
126
- threshold (int, optional): Similarity threshold (0-100) for clustering names.
127
- Defaults to 90.
128
-
129
- Returns:
130
- str: The selected representative equity name from the group.
53
+ Representative identifiers extracted from a group of RawEquity records.
54
+
55
+ Attributes:
56
+ symbol: Representative ticker symbol.
57
+ name: Representative equity name.
58
+ isin: Representative ISIN identifier.
59
+ cusip: Representative CUSIP identifier.
60
+ cik: Representative CIK identifier.
61
+ lei: Representative LEI identifier.
62
+ share_class_figi: Validated share class FIGI (must be identical across group).
131
63
  """
132
- names = [equity.name for equity in duplicate_group]
133
-
134
- # cluster names by fuzzy similarity
135
- clusters = _cluster(names, threshold=threshold)
136
64
 
137
- # weight clusters and keep earliest spelling
138
- weight = Counter(names) # how many times each form occurs
65
+ symbol: str
66
+ name: str
67
+ isin: str | None
68
+ cusip: str | None
69
+ cik: str | None
70
+ lei: str | None
71
+ share_class_figi: str
139
72
 
140
- def _cluster_weight(cluster: list[str]) -> int:
141
- return sum(weight[token] for token in cluster)
142
73
 
143
- # choose cluster with the most occurrences (i.e. highest weight)
144
- best_cluster = max(clusters, key=_cluster_weight)
145
-
146
- return next(name for name in names if name in best_cluster)
147
-
148
-
149
- def _merge_symbol(duplicate_group: Sequence[RawEquity]) -> str:
74
+ def merge(group: Sequence[RawEquity]) -> RawEquity:
150
75
  """
151
- Selects the most frequently occurring symbol from a group of RawEquity objects.
76
+ Merge a group of RawEquity records into a single, representative RawEquity instance.
152
77
 
153
- If multiple symbols share the highest frequency (a tie), the symbol that appears
154
- first in the group is returned.
78
+ Each field is merged using a configurable strategy defined in FIELD_CONFIG:
79
+ - Most fields use one of: mode (most frequent), median (for numerics), fuzzy
80
+ clustering (for similar strings), or union (for lists).
81
+ - Price range fields (last_price, fifty_two_week_min, fifty_two_week_max) are
82
+ merged together with additional consistency checks.
83
+
84
+ The merging process ensures that all records in the group share the same
85
+ share_class_figi; otherwise, a ValueError is raised.
155
86
 
156
87
  Args:
157
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects that are
158
- considered duplicates and need to be merged based on their symbol.
88
+ group (Sequence[RawEquity]): Non-empty sequence of RawEquity objects to merge.
89
+ All must have identical share_class_figi.
159
90
 
160
91
  Returns:
161
- str: The symbol that is the mode of the group, with ties broken by first
162
- occurrence.
163
- """
164
- symbols = [equity.symbol for equity in duplicate_group]
165
- return Counter(symbols).most_common(1)[0][0]
92
+ RawEquity: A new RawEquity instance with merged values for each field, according
93
+ to the configured strategies.
166
94
 
167
-
168
- def _merge_decimal_field(
169
- duplicate_group: Sequence[RawEquity],
170
- field: str,
171
- ) -> Decimal | None:
95
+ Raises:
96
+ ValueError: If the group is empty or contains multiple distinct share_class_figi
97
+ values.
172
98
  """
173
- Calculates the median value for a specified Decimal field from a group of
174
- RawEquity objects, ignoring any entries where the field value is None.
99
+ share_class_figi = _validate_share_class_figi(group)
175
100
 
176
- Args:
177
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity instances,
178
- each potentially containing the specified field.
179
- field (str): The name of the Decimal field to merge.
101
+ merged = {
102
+ "share_class_figi": share_class_figi,
103
+ **{
104
+ field: _apply_strategy(group, field, spec)
105
+ for field, spec in FIELD_CONFIG.items()
106
+ if field not in PRICE_RANGE_FIELDS
107
+ },
108
+ **_merge_price_range(group),
109
+ }
180
110
 
181
- Returns:
182
- Decimal | None: The median of the non-null field values as a Decimal,
183
- or None if no valid values are present.
184
- """
185
- values: list[Decimal] = [
186
- getattr(equity, field)
187
- for equity in duplicate_group
188
- if getattr(equity, field) is not None
189
- ]
190
- return median(values) if values else None
111
+ return RawEquity.model_validate(merged)
191
112
 
192
113
 
193
- def _merge_id(duplicate_group: Sequence[RawEquity], field: str) -> str | None:
114
+ def extract_identifiers(group: Sequence[RawEquity]) -> EquityIdentifiers:
194
115
  """
195
- Selects the most frequent non-null value for a specified identifier field
196
- ("isin", "cusip", "cik" or "share_class_figi") from a group of RawEquity objects.
116
+ Compute representative identifiers from a group of RawEquity records.
197
117
 
198
- In case of a tie, returns the earliest occurrence in the original group order.
199
- Returns None if all values are null.
118
+ Uses the same resolution algorithms as merge() mode for IDs,
119
+ fuzzy clustering for name, frequency for symbol.
200
120
 
201
121
  Args:
202
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects to merge.
203
- field (str): The name of the identifier field to merge ("isin", "cusip", "cik"
204
- or "share_class_figi").
122
+ group: A non-empty sequence of RawEquity objects from which to extract
123
+ identifiers. All records must share the same share_class_figi.
205
124
 
206
125
  Returns:
207
- str | None: Most frequent non-null identifier value, or None if all are null.
208
- """
209
- # get the values for the given field
210
- values = [
211
- getattr(equity, field)
212
- for equity in duplicate_group
213
- if getattr(equity, field) is not None
214
- ]
215
-
216
- if not values:
217
- return None
218
-
219
- counts = Counter(values)
220
-
221
- # max(counts.values()) guaranteed ≥ 1
222
- best_freq = max(counts.values())
126
+ EquityIdentifiers: Representative identifiers resolved from the group.
223
127
 
224
- # earliest among the non-null values with best frequency
225
- return next(value for value in values if counts[value] == best_freq)
226
-
227
-
228
- def _merge_mics(duplicate_group: Sequence[RawEquity]) -> list[str] | None:
229
- """
230
- Merges all non-null MIC lists from a group of RawEquity objects, preserving the
231
- order of first occurrence and removing duplicates.
232
-
233
- Args:
234
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects, each
235
- possibly containing a list of MICs.
236
-
237
- Returns:
238
- list[str] | None: A list of unique MICs in order of first appearance, or None
239
- if no MICs are found.
128
+ Raises:
129
+ ValueError: If the group is empty or contains multiple distinct
130
+ share_class_figi values.
240
131
  """
241
- # combine all (possibly None) MIC lists
242
- combined_mics = chain.from_iterable(e.mics or [] for e in duplicate_group)
243
-
244
- # Keep only truthy, non-blank strings
245
- cleaned = (mic for mic in combined_mics if mic and str(mic).strip())
246
-
247
- # Order-preserving de-duplication
248
- unique_mics = list(dict.fromkeys(cleaned))
249
-
250
- return unique_mics or None
132
+ share_class_figi = _validate_share_class_figi(group)
133
+
134
+ return EquityIdentifiers(
135
+ symbol=mode_first(_extract_field(group, "symbol")),
136
+ name=fuzzy_cluster_mode(_extract_field(group, "name")),
137
+ isin=mode_first(_extract_field(group, "isin")),
138
+ cusip=mode_first(_extract_field(group, "cusip")),
139
+ cik=mode_first(_extract_field(group, "cik")),
140
+ lei=mode_first(_extract_field(group, "lei")),
141
+ share_class_figi=share_class_figi,
142
+ )
251
143
 
252
144
 
253
- def _merge_currency(duplicate_group: Sequence[RawEquity]) -> str | None:
145
+ def _validate_share_class_figi(group: Sequence[RawEquity]) -> str:
254
146
  """
255
- Selects the most frequent non-null currency code (ISO-4217) from a group of
256
- duplicate RawEquity objects. In case of a tie, returns the first encountered
257
- currency code in the original group order. If all currency codes are null,
258
- returns None.
147
+ Validates that all RawEquity objects in the group share the same
148
+ share_class_figi value.
259
149
 
260
150
  Args:
261
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects, each
262
- possibly containing a currency attribute.
263
-
264
- Returns:
265
- str | None: The most frequent non-null currency code, or None if all are null.
266
- """
267
- # get the currency codes from the duplicate group
268
- currency_codes = [
269
- equity.currency for equity in duplicate_group if equity.currency is not None
270
- ]
271
-
272
- if not currency_codes:
273
- return None
274
-
275
- freq = Counter(currency_codes)
276
- best_freq = max(freq.values())
277
-
278
- # earliest among the non-null currencies with best frequency
279
- return next(currency for currency in currency_codes if freq[currency] == best_freq)
280
-
281
-
282
- def _merge_analyst_rating(duplicate_group: Sequence[RawEquity]) -> str | None:
283
- """
284
- Selects the most frequent non-null analyst rating ("BUY", "SELL", or "HOLD")
285
- from a group of RawEquity objects. If there is a tie, the rating that appears
286
- first in the input sequence is returned. Returns None if all ratings are missing.
151
+ group (Sequence[RawEquity]): A non-empty sequence of RawEquity objects to
152
+ validate.
287
153
 
288
- Args:
289
- duplicate_group (Sequence[RawEquity]): A sequence of RawEquity objects,
290
- each potentially containing an analyst_rating attribute.
154
+ Raises:
155
+ ValueError: If the group is empty or contains multiple distinct
156
+ share_class_figi values.
291
157
 
292
158
  Returns:
293
- str | None: The most frequent non-null analyst rating, or None if all are
294
- missing.
159
+ str: The single shared share_class_figi value present in the group.
295
160
  """
296
- ratings = [
297
- equity.analyst_rating
298
- for equity in duplicate_group
299
- if equity.analyst_rating is not None
300
- ]
301
-
302
- if not ratings:
303
- return None
304
-
305
- freq = Counter(ratings)
306
- best = max(freq.values())
161
+ if not group:
162
+ raise ValueError("Cannot merge an empty group of equities")
307
163
 
308
- # earliest among the non-null ratings with best frequency
309
- return next(rating for rating in ratings if freq[rating] == best)
164
+ figis = {raw_equity.share_class_figi for raw_equity in group}
165
+ if len(figis) != 1:
166
+ raise ValueError(
167
+ "All raw equities in the group must have identical share_class_figi values "
168
+ f"(found: {sorted(figis)})",
169
+ )
170
+ return figis.pop()
310
171
 
311
172
 
312
- def _merge_industry(
313
- duplicate_group: Sequence[RawEquity],
314
- *,
315
- threshold: int = 90,
316
- ) -> str | None:
173
+ def _apply_strategy(
174
+ group: Sequence[RawEquity],
175
+ field: str,
176
+ spec: FieldSpec,
177
+ ) -> object:
317
178
  """
318
- Selects a representative industry from a group of RawEquity objects.
179
+ Apply a specific merge strategy to a field.
319
180
 
320
- - Ignores blank or missing industry values.
321
- - Clusters similar industry names with single-link fuzzy matching (token-set ratio).
322
- - The cluster with the highest total frequency is chosen (majority rule).
323
- - Within the winning cluster, the earliest spelling in the original sequence is
324
- returned, preserving original capitalisation.
181
+ Extracts field values from the group and applies the configured strategy.
182
+ If fewer than min_sources non-None values exist, returns None to prevent
183
+ accepting dubious single-source data.
325
184
 
326
185
  Args:
327
- duplicate_group (Sequence[RawEquity]): Sequence of RawEquity objects, each
328
- possibly containing an industry attribute.
329
- threshold (int, optional): Similarity threshold (0-100) for clustering
330
- industry names. Defaults to 90.
186
+ group (Sequence[RawEquity]): Sequence of RawEquity objects to merge.
187
+ field (str): Name of the field to merge.
188
+ spec (FieldSpec): Strategy specification for this field.
331
189
 
332
190
  Returns:
333
- str | None: The selected representative industry string, or None if all values
334
- are missing or blank.
191
+ object: The merged value for this field, or None if quorum not met.
335
192
  """
336
- # skip if every record is null or blank
337
- industries = [
338
- equity.industry for equity in duplicate_group if equity.industry is not None
339
- ]
340
-
341
- if not industries:
342
- return None
343
-
344
- # cluster names by fuzzy similarity
345
- clusters = _cluster(industries, threshold=threshold)
193
+ values = _extract_field(group, field, filter_none=(spec.strategy != Strategy.UNION))
346
194
 
347
- # weight clusters and keep earliest spelling
348
- weight = Counter(industries)
195
+ if spec.max_deviation is not None and spec.strategy == Strategy.MEDIAN:
196
+ values = filter_by_deviation(values, spec.max_deviation)
349
197
 
350
- def _cluster_weight(cluster: list[str]) -> int:
351
- return sum(weight[token] for token in cluster)
198
+ if len(values) < spec.min_sources:
199
+ return None
352
200
 
353
- # choose cluster with the most occurrences (i.e. highest weight)
354
- best_cluster = max(clusters, key=_cluster_weight)
201
+ dispatch = {
202
+ Strategy.MODE: mode_first,
203
+ Strategy.FUZZY_CLUSTER: partial(fuzzy_cluster_mode, threshold=spec.threshold),
204
+ Strategy.UNION: union_ordered,
205
+ Strategy.MEDIAN: median_decimal,
206
+ }
355
207
 
356
- # earliest spelling in original order among non-nulls
357
- return next(industry for industry in industries if industry in best_cluster)
208
+ return dispatch[spec.strategy](values)
358
209
 
359
210
 
360
- def _merge_sector(
361
- duplicate_group: Sequence[RawEquity],
362
- *,
363
- threshold: int = 90,
364
- ) -> str | None:
211
+ def _merge_price_range(
212
+ group: Sequence[RawEquity],
213
+ min_consistent: int = 2,
214
+ ) -> dict[str, Decimal | None]:
365
215
  """
366
- Selects a representative sector from a group of RawEquity objects.
367
-
368
- This function clusters similar sector names using fuzzy matching (token-set ratio,
369
- single-link) and a configurable threshold. The cluster with the highest total
370
- frequency is chosen. Within the winning cluster, the earliest spelling in the
371
- original sequence is returned, preserving original capitalisation.
216
+ Merge last_price, fifty_two_week_min, and fifty_two_week_max with tiered quality
217
+ checks.
218
+
219
+ Attempts to merge price fields as a coherent triplet when possible, falling back to
220
+ independent field merging when complete records are unavailable. This preserves data
221
+ quality through consistency checks whilst avoiding unnecessary data loss.
222
+
223
+ Primary strategy (preferred):
224
+ - Requires records with all three price fields populated (complete records).
225
+ - Filters out records where last_price violates the 52-week range constraint.
226
+ - A 10% tolerance above fifty_two_week_max accommodates timing drift between
227
+ feeds.
228
+ - If quorum of consistent complete records is met (default: 2), returns median
229
+ values.
230
+
231
+ Fallback strategy (when quorum not met):
232
+ - Merges each price field independently using per-field configuration from
233
+ FIELD_CONFIG.
234
+ - Each field still requires its own min_sources threshold (typically 2).
235
+ - Allows partial price data when complete triplets are unavailable across sources.
372
236
 
373
237
  Args:
374
- duplicate_group (Sequence[RawEquity]): Sequence of RawEquity records, each
375
- possibly containing a sector attribute.
376
- threshold (int, optional): Similarity threshold (0-100) for clustering sector
377
- names. Defaults to 90.
238
+ group (Sequence[RawEquity]): Sequence of RawEquity objects to merge.
239
+ min_consistent (int): Minimum number of consistent complete records required
240
+ for primary strategy. Defaults to 2.
378
241
 
379
242
  Returns:
380
- str | None: The selected representative sector string, or None if all values
381
- are missing or blank.
243
+ dict[str, Decimal | None]: Dictionary containing merged last_price,
244
+ fifty_two_week_min, and fifty_two_week_max values. Fields may be None
245
+ if neither strategy can satisfy quorum requirements.
382
246
  """
383
- # skip if every record is null or blank
384
- sectors = [equity.sector for equity in duplicate_group if equity.sector is not None]
385
-
386
- if not sectors:
387
- return None
388
-
389
- # cluster names by fuzzy similarity
390
- clusters = _cluster(sectors, threshold=threshold)
391
-
392
- # weight clusters and keep earliest spelling
393
- weights = Counter(sectors)
394
-
395
- def _cluster_weight(cluster: list[str]) -> int:
396
- return sum(weights[token] for token in cluster)
247
+ consistent = tuple(
248
+ filter(_is_price_consistent, filter(_is_price_complete, group)),
249
+ )
397
250
 
398
- # choose cluster with the most occurrences (i.e. highest weight)
399
- best_cluster = max(clusters, key=_cluster_weight)
251
+ if len(consistent) >= min_consistent:
252
+ return {
253
+ "last_price": median_decimal([eq.last_price for eq in consistent]),
254
+ "fifty_two_week_min": median_decimal(
255
+ [eq.fifty_two_week_min for eq in consistent],
256
+ ),
257
+ "fifty_two_week_max": median_decimal(
258
+ [eq.fifty_two_week_max for eq in consistent],
259
+ ),
260
+ }
400
261
 
401
- # earliest spelling in original order among non-nulls
402
- return next(sector for sector in sectors if sector in best_cluster)
262
+ # Fallback: merge fields independently
263
+ return {
264
+ field: _apply_strategy(group, field, FIELD_CONFIG[field])
265
+ for field in PRICE_RANGE_FIELDS
266
+ }
403
267
 
404
268
 
405
- @cache
406
- def _token_ratio(a: str, b: str) -> int:
269
+ def _is_price_complete(eq: RawEquity) -> bool:
407
270
  """
408
- Compute the token-set ratio between two strings using fuzzy matching.
271
+ Checks if a RawEquity record has non-null values for last_price, fifty_two_week_min,
272
+ and fifty_two_week_max.
409
273
 
410
274
  Args:
411
- a (str): The first string to compare.
412
- b (str): The second string to compare.
275
+ eq (RawEquity): The RawEquity instance to check.
413
276
 
414
277
  Returns:
415
- int: The token-set similarity ratio (0-100) between the two strings.
278
+ bool: True if all three price fields are not None, False otherwise.
416
279
  """
417
- return fuzz.token_set_ratio(a, b)
280
+ return (
281
+ eq.last_price is not None
282
+ and eq.fifty_two_week_min is not None
283
+ and eq.fifty_two_week_max is not None
284
+ )
418
285
 
419
286
 
420
- def _cluster(names: list[str], threshold: int = 90) -> list[list[str]]:
287
+ def _is_price_consistent(eq: RawEquity) -> bool:
421
288
  """
422
- Groups similar strings into clusters using single-link clustering based on token-set
423
- ratio.
424
-
425
- Each name is compared to the representative (first item) of each existing cluster.
426
- If the token-set ratio between the name and a cluster's representative is greater
427
- than or equal to the specified threshold, the name is added to that cluster.
428
-
429
- Otherwise, a new cluster is created for the name.
289
+ Checks if the last_price of a RawEquity record falls within its fifty_two_week_min
290
+ and fifty_two_week_max range, allowing a 10% tolerance above the max.
430
291
 
431
292
  Args:
432
- names (list[str]): List of strings to be clustered.
433
- threshold (int, optional): Minimum token-set ratio (0-100) required to join an
434
- existing cluster. Defaults to 90.
293
+ eq (RawEquity): The RawEquity instance to check.
435
294
 
436
295
  Returns:
437
- list[list[str]]: A list of clusters, where each cluster is a list of similar
438
- strings.
296
+ bool: True if last_price is between fifty_two_week_min and up to 10% above
297
+ fifty_two_week_max, False otherwise. Returns False if any price field
298
+ is None.
439
299
  """
440
- clusters: list[list[str]] = []
441
-
442
- for name in names:
443
- # find the first cluster whose representative (first item)
444
- # is similar enough to this name
445
- target: list[str] = next(
446
- (
447
- cluster
448
- for cluster in clusters
449
- if _token_ratio(name, cluster[0]) >= threshold
450
- ),
451
- None,
452
- )
453
-
454
- if target:
455
- target.append(name)
456
- else:
457
- clusters.append([name])
458
-
459
- return clusters
300
+ if (
301
+ eq.last_price is None
302
+ or eq.fifty_two_week_min is None
303
+ or eq.fifty_two_week_max is None
304
+ ):
305
+ return False
306
+
307
+ price_tolerance = Decimal("1.1")
308
+ return (
309
+ eq.fifty_two_week_min
310
+ <= eq.last_price
311
+ <= eq.fifty_two_week_max * price_tolerance
312
+ )