equity-aggregator 0.1.1-py3-none-any.whl → 0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- equity_aggregator/README.md +40 -36
- equity_aggregator/adapters/__init__.py +13 -7
- equity_aggregator/adapters/data_sources/__init__.py +4 -6
- equity_aggregator/adapters/data_sources/_utils/_client.py +1 -1
- equity_aggregator/adapters/data_sources/{authoritative_feeds → _utils}/_record_types.py +1 -1
- equity_aggregator/adapters/data_sources/discovery_feeds/__init__.py +17 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/__init__.py +10 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/backoff.py +33 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/_utils/parser.py +107 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/intrinio.py +305 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/intrinio/session.py +197 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/__init__.py +9 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/backoff.py +33 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/_utils/parser.py +120 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/lseg.py +239 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/lseg/session.py +162 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/sec/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/{authoritative_feeds → discovery_feeds/sec}/sec.py +4 -5
- equity_aggregator/adapters/data_sources/discovery_feeds/stock_analysis/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/stock_analysis/stock_analysis.py +150 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/tradingview/__init__.py +5 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/tradingview/tradingview.py +275 -0
- equity_aggregator/adapters/data_sources/discovery_feeds/xetra/__init__.py +7 -0
- equity_aggregator/adapters/data_sources/{authoritative_feeds → discovery_feeds/xetra}/xetra.py +9 -12
- equity_aggregator/adapters/data_sources/enrichment_feeds/__init__.py +6 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/__init__.py +5 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/api.py +71 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/download.py +109 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/gleif.py +195 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/gleif/parser.py +75 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/__init__.py +1 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/_utils/__init__.py +11 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/{utils → _utils}/backoff.py +1 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/{utils → _utils}/fuzzy.py +28 -26
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/_utils/json.py +36 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/__init__.py +1 -1
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/{summary.py → quote_summary.py} +44 -30
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/api/search.py +10 -5
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/auth.py +130 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/config.py +3 -3
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/ranking.py +97 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/session.py +85 -218
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/transport.py +191 -0
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/yfinance.py +413 -0
- equity_aggregator/adapters/data_sources/reference_lookup/exchange_rate_api.py +6 -13
- equity_aggregator/adapters/data_sources/reference_lookup/openfigi.py +23 -7
- equity_aggregator/cli/dispatcher.py +11 -8
- equity_aggregator/cli/main.py +14 -5
- equity_aggregator/cli/parser.py +1 -1
- equity_aggregator/cli/signals.py +32 -0
- equity_aggregator/domain/_utils/__init__.py +2 -2
- equity_aggregator/domain/_utils/_load_converter.py +30 -21
- equity_aggregator/domain/_utils/_merge.py +221 -368
- equity_aggregator/domain/_utils/_merge_config.py +205 -0
- equity_aggregator/domain/_utils/_strategies.py +180 -0
- equity_aggregator/domain/pipeline/resolve.py +17 -11
- equity_aggregator/domain/pipeline/runner.py +4 -4
- equity_aggregator/domain/pipeline/seed.py +5 -1
- equity_aggregator/domain/pipeline/transforms/__init__.py +2 -2
- equity_aggregator/domain/pipeline/transforms/canonicalise.py +1 -1
- equity_aggregator/domain/pipeline/transforms/enrich.py +328 -285
- equity_aggregator/domain/pipeline/transforms/group.py +48 -0
- equity_aggregator/logging_config.py +4 -1
- equity_aggregator/schemas/__init__.py +11 -5
- equity_aggregator/schemas/canonical.py +11 -6
- equity_aggregator/schemas/feeds/__init__.py +11 -5
- equity_aggregator/schemas/feeds/gleif_feed_data.py +35 -0
- equity_aggregator/schemas/feeds/intrinio_feed_data.py +142 -0
- equity_aggregator/schemas/feeds/{lse_feed_data.py → lseg_feed_data.py} +85 -52
- equity_aggregator/schemas/feeds/sec_feed_data.py +36 -6
- equity_aggregator/schemas/feeds/stock_analysis_feed_data.py +107 -0
- equity_aggregator/schemas/feeds/tradingview_feed_data.py +144 -0
- equity_aggregator/schemas/feeds/xetra_feed_data.py +1 -1
- equity_aggregator/schemas/feeds/yfinance_feed_data.py +47 -35
- equity_aggregator/schemas/raw.py +5 -3
- equity_aggregator/schemas/types.py +7 -0
- equity_aggregator/schemas/validators.py +81 -27
- equity_aggregator/storage/data_store.py +5 -3
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.4.dist-info}/METADATA +205 -115
- equity_aggregator-0.1.4.dist-info/RECORD +103 -0
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.4.dist-info}/WHEEL +1 -1
- equity_aggregator/adapters/data_sources/authoritative_feeds/__init__.py +0 -13
- equity_aggregator/adapters/data_sources/authoritative_feeds/euronext.py +0 -420
- equity_aggregator/adapters/data_sources/authoritative_feeds/lse.py +0 -352
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/feed.py +0 -350
- equity_aggregator/adapters/data_sources/enrichment_feeds/yfinance/utils/__init__.py +0 -9
- equity_aggregator/domain/pipeline/transforms/deduplicate.py +0 -54
- equity_aggregator/schemas/feeds/euronext_feed_data.py +0 -59
- equity_aggregator-0.1.1.dist-info/RECORD +0 -72
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.4.dist-info}/entry_points.txt +0 -0
- {equity_aggregator-0.1.1.dist-info → equity_aggregator-0.1.4.dist-info}/licenses/LICENCE.txt +0 -0
equity_aggregator/domain/_utils/_merge_config.py (new file)
@@ -0,0 +1,205 @@
+# _utils/_merge_config.py
+
+
+from decimal import Decimal
+from enum import Enum, auto
+from typing import NamedTuple
+
+
+class Strategy(Enum):
+    """
+    Enumeration of available merge strategies for RawEquity fields.
+
+    Attributes:
+        MODE: Most frequent value, ties broken by first occurrence.
+        MEDIAN: Median of numeric values.
+        FUZZY_CLUSTER: Fuzzy clustering with frequency weighting.
+        UNION: Union of all lists, order-preserving and deduplicated.
+    """
+
+    MODE = auto()
+    MEDIAN = auto()
+    FUZZY_CLUSTER = auto()
+    UNION = auto()
+
+
+class FieldSpec(NamedTuple):
+    """
+    Specification for how to merge a particular field.
+
+    Attributes:
+        strategy: The merge strategy to apply.
+        threshold: Similarity threshold for FUZZY_CLUSTER strategy (0-100).
+            Ignored for other strategies.
+        min_sources: Minimum number of non-None sources required to accept merged value.
+            If fewer sources provide data, returns None instead. Defaults to 1.
+        max_deviation: Maximum allowed deviation from median
+            (as decimal, e.g., 0.5 = 50%). Only applies to MEDIAN strategy.
+            None disables deviation filtering.
+    """
+
+    strategy: Strategy
+    threshold: int = 90
+    min_sources: int = 1
+    max_deviation: Decimal | None = None
+
+
+# Field-to-strategy mapping for all RawEquity fields
+FIELD_CONFIG: dict[str, FieldSpec] = {
+    # Identifier and metadata fields (single source acceptable)
+    "name": FieldSpec(Strategy.FUZZY_CLUSTER, min_sources=1),
+    "symbol": FieldSpec(Strategy.MODE, min_sources=1),
+    "isin": FieldSpec(Strategy.MODE, min_sources=1),
+    "cusip": FieldSpec(Strategy.MODE, min_sources=1),
+    "cik": FieldSpec(Strategy.MODE, min_sources=1),
+    "lei": FieldSpec(Strategy.MODE, min_sources=1),
+    "currency": FieldSpec(Strategy.MODE, min_sources=1),
+    "analyst_rating": FieldSpec(Strategy.MODE, min_sources=1),
+    "industry": FieldSpec(Strategy.FUZZY_CLUSTER, min_sources=1),
+    "sector": FieldSpec(Strategy.FUZZY_CLUSTER, min_sources=1),
+    "mics": FieldSpec(Strategy.UNION, min_sources=1),
+    # Critical price and market data (require corroboration from multiple sources)
+    # Fields with >50% multi-source coverage that benefit from cross-validation
+    "market_cap": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "last_price": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "fifty_two_week_min": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "fifty_two_week_max": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    # Other financial metrics
+    # Fields with low coverage (<5%) accept single source to prevent data loss
+    # Fields with moderate coverage (>20%) require corroboration for quality
+    "dividend_yield": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "market_volume": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "held_insiders": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "held_institutions": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "short_interest": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "share_float": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "shares_outstanding": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "revenue_per_share": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "profit_margin": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "gross_margin": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "operating_margin": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "free_cash_flow": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "operating_cash_flow": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "return_on_equity": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "return_on_assets": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "performance_1_year": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "total_debt": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "revenue": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "ebitda": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "trailing_pe": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=2,
+        max_deviation=Decimal("0.5"),
+    ),
+    "price_to_book": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+    "trailing_eps": FieldSpec(
+        Strategy.MEDIAN,
+        min_sources=1,
+        max_deviation=Decimal("0.5"),
+    ),
+}
+
+# Coherent field groups requiring joint validation
+PRICE_RANGE_FIELDS: frozenset[str] = frozenset(
+    {
+        "last_price",
+        "fifty_two_week_min",
+        "fifty_two_week_max",
+    },
+)
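
For orientation, the rewritten _merge.py presumably consumes this table to decide how each RawEquity field is combined across feeds. A minimal illustration of what the configuration encodes for a corroboration-required field follows; note that _merge_config is an internal, underscore-prefixed module, so the import path below is shown purely for illustration and is not a stable public API.

from decimal import Decimal

from equity_aggregator.domain._utils._merge_config import FIELD_CONFIG, Strategy

spec = FIELD_CONFIG["market_cap"]
assert spec.strategy is Strategy.MEDIAN        # candidates are combined via the median
assert spec.min_sources == 2                   # a lone source yields None for this field
assert spec.max_deviation == Decimal("0.5")    # candidates >50% from the median are discarded
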
equity_aggregator/domain/_utils/_strategies.py (new file)
@@ -0,0 +1,180 @@
+# _utils/_strategies.py
+
+
+from collections import Counter
+from collections.abc import Sequence
+from decimal import Decimal
+from functools import cache
+from statistics import median
+
+from rapidfuzz import fuzz
+
+
+def filter_by_deviation(
+    values: Sequence[Decimal],
+    max_deviation: Decimal = Decimal("0.5"),
+    min_samples: int = 3,
+) -> list[Decimal]:
+    """
+    Filter values that deviate more than a threshold percentage from the median.
+
+    Args:
+        values: Sequence of Decimal values.
+        max_deviation: Maximum allowed deviation as decimal (0.5 = 50%).
+        min_samples: Minimum sample size to apply filtering. Below this,
+            returns values unfiltered.
+
+    Returns:
+        List of values within threshold, or all values if filtering not applicable.
+    """
+    if len(values) < min_samples:
+        return list(values)
+
+    med = median(values)
+
+    if med == 0:
+        return list(values)
+
+    return [v for v in values if abs(v - med) / abs(med) <= max_deviation]
+
+
+def mode_first[T](values: Sequence[T]) -> T | None:
+    """
+    Selects the most frequently occurring value from a sequence.
+
+    If multiple values share the highest frequency (a tie), the value that appears
+    first in the sequence is returned. Returns None if the sequence is empty.
+
+    Args:
+        values (Sequence[T]): A sequence of values from which to select the mode.
+
+    Returns:
+        T | None: The most frequent value with ties broken by first occurrence,
+            or None if the sequence is empty.
+    """
+    if not values:
+        return None
+
+    counts = Counter(values)
+    best_freq = max(counts.values())
+    return next(v for v in values if counts[v] == best_freq)
+
+
+def median_decimal(values: Sequence[Decimal]) -> Decimal | None:
+    """
+    Calculates the median value of a sequence of Decimal values.
+
+    Args:
+        values (Sequence[Decimal]): A sequence of Decimal values.
+
+    Returns:
+        Decimal | None: The median of the sequence as a Decimal, or None if
+            the sequence is empty.
+    """
+    return median(values) if values else None
+
+
+def union_ordered[T](lists: Sequence[list[T] | None]) -> list[T] | None:
+    """
+    Merges multiple lists into a single deduplicated list, preserving the order of
+    first occurrence.
+
+    Flattens all input lists, removes duplicates while maintaining the order in which
+    elements first appear, and filters out empty or blank string values. Returns None
+    if the result is empty.
+
+    Args:
+        lists (Sequence[list[T] | None]): A sequence of lists (or None values) to merge.
+
+    Returns:
+        list[T] | None: A deduplicated list in order of first appearance, or None
+            if no valid elements exist.
+    """
+    seen: dict[T, None] = {}
+    for lst in lists:
+        for item in lst or []:
+            if item and str(item).strip():
+                seen.setdefault(item, None)
+    return list(seen) or None
+
+
+def fuzzy_cluster_mode(
+    values: Sequence[str],
+    threshold: int = 90,
+) -> str | None:
+    """
+    Selects a representative string from a sequence using fuzzy clustering.
+
+    This function clusters similar strings using fuzzy matching (token-set ratio),
+    then selects the cluster with the highest total occurrence count. Within the
+    chosen cluster, it returns the earliest original spelling found in the input
+    sequence.
+
+    Args:
+        values (Sequence[str]): A sequence of strings to cluster and select from.
+        threshold (int, optional): Similarity threshold (0-100) for clustering strings.
+            Defaults to 90.
+
+    Returns:
+        str | None: The selected representative string from the group, or None if
+            the sequence is empty.
+    """
+    if not values:
+        return None
+
+    clusters = _cluster(list(values), threshold)
+    weights = Counter(values)
+
+    best_cluster = max(clusters, key=lambda c: sum(weights[v] for v in c))
+    return next(v for v in values if v in best_cluster)
+
+
+@cache
+def _token_ratio(a: str, b: str) -> int:
+    """
+    Compute the token-set ratio between two strings using fuzzy matching.
+
+    Args:
+        a (str): The first string to compare.
+        b (str): The second string to compare.
+
+    Returns:
+        int: The token-set similarity ratio (0-100) between the two strings.
+    """
+    return fuzz.token_set_ratio(a, b)
+
+
+def _cluster(names: list[str], threshold: int = 90) -> list[list[str]]:
+    """
+    Groups similar strings into clusters using single-link clustering based on token-set
+    ratio.
+
+    Each name is compared to the representative (first item) of each existing cluster.
+    If the token-set ratio between the name and a cluster's representative is greater
+    than or equal to the specified threshold, the name is added to that cluster.
+
+    Otherwise, a new cluster is created for the name.
+
+    Args:
+        names (list[str]): List of strings to be clustered.
+        threshold (int, optional): Minimum token-set ratio (0-100) required to join an
+            existing cluster. Defaults to 90.
+
+    Returns:
+        list[list[str]]: A list of clusters, where each cluster is a list of similar
+            strings.
+    """
+    clusters: list[list[str]] = []
+
+    for name in names:
+        target = next(
+            (c for c in clusters if _token_ratio(name, c[0]) >= threshold),
+            None,
+        )
+
+        if target:
+            target.append(name)
+        else:
+            clusters.append([name])
+
+    return clusters
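
These helpers are small enough to exercise directly. A few illustrative calls follow; the import path again points at an internal module and is shown only for clarity, and the expected results follow from the definitions above.

from decimal import Decimal

from equity_aggregator.domain._utils._strategies import (
    filter_by_deviation,
    fuzzy_cluster_mode,
    mode_first,
    union_ordered,
)

# Tie on frequency -> first occurrence wins.
mode_first(["NYSE", "NASDAQ", "NYSE"])                      # "NYSE"

# Outliers more than 50% from the median are dropped (only when >= 3 samples).
filter_by_deviation([Decimal("100"), Decimal("102"), Decimal("300")])
# [Decimal("100"), Decimal("102")]

# Order-preserving, deduplicated union of MIC lists; None entries are skipped.
union_ordered([["XNYS", "XNAS"], None, ["XNAS", "XLON"]])   # ["XNYS", "XNAS", "XLON"]

# Near-duplicate spellings are weighted by frequency; the heaviest cluster's
# earliest spelling is returned.
fuzzy_cluster_mode(["Apple Inc", "Apple Inc", "Apple Incorporated"])  # "Apple Inc"
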
equity_aggregator/domain/pipeline/resolve.py
@@ -6,15 +6,19 @@ from collections.abc import AsyncIterator, Callable
 from typing import NamedTuple
 
 from equity_aggregator.adapters import (
-
-
+    fetch_equity_records_intrinio,
+    fetch_equity_records_lseg,
     fetch_equity_records_sec,
+    fetch_equity_records_stock_analysis,
+    fetch_equity_records_tradingview,
     fetch_equity_records_xetra,
 )
 from equity_aggregator.schemas import (
-
-
+    IntrinioFeedData,
+    LsegFeedData,
     SecFeedData,
+    StockAnalysisFeedData,
+    TradingViewFeedData,
     XetraFeedData,
 )
 
@@ -30,12 +34,14 @@ class FeedRecord(NamedTuple):
     raw_data: dict[str, object]
 
 
-# List of
-
-    (fetch_equity_records_euronext, EuronextFeedData),
+# List of discovery feed fetchers and their corresponding data models
+_DISCOVERY_FEEDS: tuple[FeedPair] = [
     (fetch_equity_records_xetra, XetraFeedData),
-    (
+    (fetch_equity_records_lseg, LsegFeedData),
+    (fetch_equity_records_stock_analysis, StockAnalysisFeedData),
+    (fetch_equity_records_tradingview, TradingViewFeedData),
     (fetch_equity_records_sec, SecFeedData),
+    (fetch_equity_records_intrinio, IntrinioFeedData),
 ]
 
 
@@ -43,7 +49,7 @@ async def resolve(
     feeds: tuple[FeedPair, ...] | None = None,
 ) -> AsyncIterator[FeedRecord]:
     """
-    Merge all
+    Merge all discovery feed streams into a single asynchronous output.
 
     Args:
         feeds
@@ -56,9 +62,9 @@ async def resolve(
        items into a shared queue. Records are yielded as they arrive, ensuring
        minimal latency and efficient merging of multiple asynchronous sources.
     """
-    logger.info("Resolving raw equities from
+    logger.info("Resolving raw equities from discovery feeds...")
 
-    feeds = feeds or
+    feeds = feeds or _DISCOVERY_FEEDS
     queue: asyncio.Queue[FeedRecord | None] = asyncio.Queue()
 
     async with asyncio.TaskGroup() as task_group:
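
The new resolve() fans several discovery feeds into one stream: each feed task pushes records onto a shared asyncio.Queue and the consumer yields them as they arrive. A stripped-down sketch of that fan-in pattern, using hypothetical produce/fan_in names rather than the actual internals:

import asyncio
from collections.abc import AsyncIterator


async def produce(queue: asyncio.Queue, records: list[str]) -> None:
    # Push this feed's records, then a sentinel to signal completion.
    for record in records:
        await queue.put(record)
    await queue.put(None)


async def fan_in(feeds: list[list[str]]) -> AsyncIterator[str]:
    # Merge all feeds into one stream, yielding records as soon as they arrive.
    queue: asyncio.Queue[str | None] = asyncio.Queue()
    async with asyncio.TaskGroup() as task_group:
        for records in feeds:
            task_group.create_task(produce(queue, records))
        remaining = len(feeds)
        while remaining:
            record = await queue.get()
            if record is None:
                remaining -= 1
            else:
                yield record


async def main() -> None:
    async for record in fan_in([["a1", "a2"], ["b1"]]):
        print(record)


asyncio.run(main())
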
equity_aggregator/domain/pipeline/runner.py
@@ -5,21 +5,21 @@ import logging
 from equity_aggregator.domain.pipeline.resolve import resolve
 from equity_aggregator.schemas import CanonicalEquity
 
-from .transforms import canonicalise, convert,
+from .transforms import canonicalise, convert, enrich, group, identify, parse
 
 logger = logging.getLogger(__name__)
 
 
 async def aggregate_canonical_equities() -> list[CanonicalEquity]:
     """
-    Aggregates and processes raw equity data from
+    Aggregates and processes raw equity data from discovery feeds, returning
     a list of unique, canonical equities.
 
     The pipeline applies the following transforms in order:
     - parse: Parse raw equity data.
     - convert: Convert prices to reference currency (USD).
     - identify: Attach identification metadata.
-    -
+    - group: Group equities by share_class_figi.
     - enrich: Add supplementary data.
     - canonicalise: Convert to canonical equity format.
 
@@ -37,7 +37,7 @@ async def aggregate_canonical_equities() -> list[CanonicalEquity]:
         parse,
         convert,
         identify,
-
+        group,
         enrich,
         canonicalise,
     )
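
The runner threads the resolved stream through these transforms in the listed order (parse, convert, identify, group, enrich, canonicalise), each stage consuming the previous stage's async stream. A rough sketch of how such async-iterator stages compose, with hypothetical compose/double names standing in for the real transforms:

import asyncio
from collections.abc import AsyncIterable, AsyncIterator, Callable


def compose(
    source: AsyncIterable,
    *stages: Callable[[AsyncIterable], AsyncIterator],
) -> AsyncIterator:
    # Each stage wraps the previous stage's stream, so records flow lazily
    # through the whole chain.
    stream: AsyncIterable = source
    for stage in stages:
        stream = stage(stream)
    return aiter(stream)


async def double(stream: AsyncIterable[int]) -> AsyncIterator[int]:
    async for item in stream:
        yield item * 2


async def main() -> None:
    async def source() -> AsyncIterator[int]:
        for value in (1, 2, 3):
            yield value

    async for item in compose(source(), double, double):
        print(item)  # prints 4, 8, 12


asyncio.run(main())
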
equity_aggregator/domain/pipeline/seed.py
@@ -10,13 +10,17 @@ from .runner import aggregate_canonical_equities
 logger = logging.getLogger(__name__)
 
 
-def seed_canonical_equities() -> None:
+def seed_canonical_equities() -> None:  # pragma: no cover
     """
     Runs the canonical equities aggregation pipeline and seeds the database.
 
     This function executes the aggregation pipeline to collect canonical equities,
     then saves them to the SQLite data store.
 
+    Note: This function is excluded from unit test coverage as it executes
+    the complete aggregation pipeline involving external API calls, database
+    operations, and async streaming transforms.
+
     Args:
         None
 
equity_aggregator/domain/pipeline/transforms/__init__.py
@@ -2,13 +2,13 @@
 
 from .canonicalise import canonicalise
 from .convert import convert
-from .deduplicate import deduplicate
 from .enrich import enrich
+from .group import group
 from .identify import identify
 from .parse import parse
 
 __all__ = [
-    "
+    "group",
     "enrich",
     "identify",
     "canonicalise",
equity_aggregator/domain/pipeline/transforms/canonicalise.py
@@ -3,8 +3,8 @@
 import logging
 from collections.abc import AsyncIterable, AsyncIterator
 
-from equity_aggregator.schemas.raw import RawEquity
 from equity_aggregator.schemas.canonical import CanonicalEquity
+from equity_aggregator.schemas.raw import RawEquity
 
 logger = logging.getLogger(__name__)
 