PyPI - nomenklatura-mpt - Versions diffs - 4.1.10__tar.gz → 4.1.12__tar.gz - Mend

nomenklatura-mpt 4.1.10__tar.gz → 4.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129) hide show

{nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nomenklatura_mpt
-Version: 4.1.10
+Version: 4.1.12
 Summary: Make record linkages in followthemoney data.
 Project-URL: Documentation, https://github.com/opensanctions/nomenklatura/
 Project-URL: Repository, https://github.com/opensanctions/nomenklatura.git

{nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/__init__.py RENAMED Viewed

@@ -8,11 +8,15 @@ from nomenklatura.matching.erun.model import EntityResolveRegression
 from nomenklatura.matching.erun.train import train_matcher as train_erun_matcher
 from nomenklatura.matching.logic_v1.model import LogicV1
 from nomenklatura.matching.logic_v2.model import LogicV2
+from nomenklatura.matching.logic_v3.model import LogicV3
+from nomenklatura.matching.logic_v4.model import LogicV4
 from nomenklatura.matching.types import ScoringAlgorithm, ScoringConfig
 ALGORITHMS: List[Type[ScoringAlgorithm]] = [
     LogicV1,
     LogicV2,
+    LogicV3,
+    LogicV4,
     NameMatcher,
     NameQualifiedMatcher,
     RegressionV1,
@@ -44,4 +48,7 @@ __all__ = [
     "ScoringConfig",
     "LogicV1",
     "LogicV2",
+    "LogicV3",
+    "LogicV4",
+    "SVMV1",
 ]

{nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/match.py RENAMED Viewed

@@ -193,3 +193,20 @@ def name_match(query: E, result: E, config: ScoringConfig) -> FtResult:
     if best.detail is None:
         best.detail = "No names available for matching"
     return best
+def name_match_levenshtein(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Match two entities by analyzing and comparing their names."""
+    schema = model.common_schema(query.schema, result.schema)
+    type_tag = schema_type_tag(schema)
+    best = FtResult(score=0.0, detail=None)
+    if type_tag == NameTypeTag.UNK:
+        # Name matching is not supported for entities that are not listed
+        # as a person, organization, or a thing.
+        best.detail = "Unsuited for name matching: %s" % schema.name
+        return best
+    return match_object_names(query, result, config)

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/identifiers.py ADDED Viewed

@@ -0,0 +1,104 @@
+from itertools import product
+from rigour.ids import LEI, ISIN, INN, OGRN, IMO, BIC
+from rigour.ids import StrictFormat
+from rigour.text.distance import levenshtein
+from followthemoney import E, registry
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import has_schema, type_pair
+from nomenklatura.matching.compare.util import clean_map, CleanFunc
+def _id_prop_match(
+    query: E,
+    result: E,
+    prop_name: str,
+    clean: CleanFunc = None,
+) -> bool:
+    """Check if a specific property identifier is shared by two entities."""
+    prop = query.schema.get(prop_name)
+    if prop is None:
+        return False
+    lv = clean_map(query.get(prop), clean=clean)
+    if not len(lv):
+        return False
+    rv_ = result.get_type_values(prop.type, matchable=True)
+    rv = clean_map(rv_, clean=clean)
+    common = lv.intersection(rv)
+    return len(common) > 0
+def _bidi_id_prop_match(
+    query: E,
+    result: E,
+    prop_name: str,
+    clean: CleanFunc = None,
+) -> FtResult:
+    """Check if a specific property identifier is shared by two entities."""
+    if _id_prop_match(query, result, prop_name, clean=clean):
+        return FtResult(score=1.0, detail="Property match: %r" % prop_name)
+    if _id_prop_match(result, query, prop_name, clean=clean):
+        return FtResult(score=1.0, detail="Property match: %r" % prop_name)
+    return FtResult(score=0.0, detail="No match: %r" % prop_name)
+def lei_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same Legal Entity Identifier."""
+    return _bidi_id_prop_match(query, result, "leiCode", LEI.normalize)
+def bic_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same SWIFT BIC."""
+    return _bidi_id_prop_match(query, result, "swiftBic", BIC.normalize)
+def ogrn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same Russian company registration (OGRN) code."""
+    return _bidi_id_prop_match(query, result, "ogrnCode", OGRN.normalize)
+def inn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same Russian tax identifier (INN)."""
+    return _bidi_id_prop_match(query, result, "innCode", INN.normalize)
+def isin_security_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two securities have the same ISIN."""
+    if not has_schema(query, result, "Security"):
+        return FtResult(score=0.0, detail="None of the entities is a security")
+    return _bidi_id_prop_match(query, result, "isin", ISIN.normalize)
+def vessel_imo_mmsi_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two vessels have the same IMO or MMSI identifier."""
+    imo_res = _bidi_id_prop_match(query, result, "imoNumber", IMO.normalize)
+    if imo_res.score > 0.0:
+        return imo_res
+    return _bidi_id_prop_match(query, result, "mmsi")
+def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two companies or organizations have different tax identifiers or registration
+    numbers."""
+    if not has_schema(query, result, "Organization"):
+        return FtResult(score=0.0, detail=None)
+    query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
+    query_ids = clean_map(query_ids_, StrictFormat.normalize)
+    result_ids = clean_map(result_ids_, StrictFormat.normalize)
+    if not len(query_ids) or not len(result_ids):
+        return FtResult(score=0.0, detail=None)
+    common = query_ids.intersection(result_ids)
+    if len(common) > 0:
+        return FtResult(score=0.0, detail=None)
+    max_ratio = 0.0
+    for query_id, result_id in product(query_ids, result_ids):
+        distance = levenshtein(query_id, result_id)
+        max_len = max(len(query_id), len(result_id))
+        ratio = 1.0 - (distance / float(max_len))
+        if ratio > 0.7:
+            max_ratio = max(max_ratio, ratio)
+    detail = "Mismatched identifiers: %s vs %s" % (
+        ", ".join(query_ids),
+        ", ".join(result_ids),
+    )
+    return FtResult(score=1 - max_ratio, detail=detail)

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/model.py ADDED Viewed

@@ -0,0 +1,99 @@
+from typing import Dict, List
+from nomenklatura.matching.types import Feature, HeuristicAlgorithm
+from nomenklatura.matching.types import ConfigVar, ConfigVarType
+from nomenklatura.matching.compare.countries import country_mismatch
+from nomenklatura.matching.compare.gender import gender_mismatch
+from nomenklatura.matching.compare.identifiers import crypto_wallet_address
+from nomenklatura.matching.compare.identifiers import identifier_match
+from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
+from nomenklatura.matching.compare.names import weak_alias_match
+from nomenklatura.matching.compare.addresses import address_entity_match
+from nomenklatura.matching.compare.addresses import address_prop_match
+from nomenklatura.matching.logic_v2.names.match import name_match_levenshtein
+from nomenklatura.matching.logic_v2.identifiers import bic_code_match
+from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
+from nomenklatura.matching.logic_v2.identifiers import isin_security_match
+from nomenklatura.matching.logic_v2.identifiers import lei_code_match
+from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
+from nomenklatura.matching.logic_v2.identifiers import uei_code_match
+from nomenklatura.matching.logic_v2.identifiers import npi_code_match
+from nomenklatura.matching.util import FNUL
+class LogicV3(HeuristicAlgorithm):
+    """A rule-based matching system that generates a set of basic scores via
+    name and identifier-based matching, and then qualifies that score using
+    supporting or contradicting features of the two entities. Version 3 uses
+    the same set of features as version 2, but replaces the name_match feature
+    with a new implementation. This new name matching function uses strict levenshtein
+    for name matching"""
+    NAME = "logic-v3"
+    features = [
+        Feature(func=name_match_levenshtein, weight=1.0),
+        Feature(func=address_entity_match, weight=0.98),
+        Feature(func=crypto_wallet_address, weight=0.98),
+        Feature(func=isin_security_match, weight=0.98),
+        Feature(func=lei_code_match, weight=0.95),
+        Feature(func=ogrn_code_match, weight=0.95),
+        Feature(func=vessel_imo_mmsi_match, weight=0.95),
+        Feature(func=inn_code_match, weight=0.95),
+        Feature(func=bic_code_match, weight=0.95),
+        Feature(func=uei_code_match, weight=0.95),
+        Feature(func=npi_code_match, weight=0.95),
+        Feature(func=identifier_match, weight=0.85),
+        Feature(func=weak_alias_match, weight=0.8),
+        Feature(func=address_prop_match, weight=0.2, qualifier=True),
+        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
+        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
+        Feature(func=dob_day_disjoint, weight=-0.25, qualifier=True),
+        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
+    ]
+    CONFIG = {
+        "nm_number_mismatch": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Penalty for mismatching numbers in object or company names.",
+            default=0.3,
+        ),
+        "nm_extra_query_name": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Weight for name parts in the query not matched to the result.",
+            default=0.8,
+        ),
+        "nm_extra_result_name": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Weight for name parts in the result not matched to the query.",
+            default=0.2,
+        ),
+        "nm_family_name_weight": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Extra weight multiplier for family name in person matches (John Smith vs. John Gruber is clearly distinct).",
+            default=1.3,
+        ),
+        "nm_fuzzy_cutoff_factor": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Extra factor for when a fuzzy match is triggered in name matching. "
+            "Below a certain threshold, a fuzzy match is considered as a non-match (score = 0.0). "
+            "Adjusting this multiplier will raise this threshold, making a fuzzy match trigger more leniently.",
+            default=1.0,
+        ),
+    }
+    @classmethod
+    def compute_score(
+        cls, scores: Dict[str, float], weights: Dict[str, float]
+    ) -> float:
+        mains: List[float] = []
+        for feat in cls.features:
+            if feat.qualifier:
+                continue
+            weight = scores.get(feat.name, FNUL) * weights.get(feat.name, FNUL)
+            mains.append(weight)
+        score = max(mains)
+        for feat in cls.features:
+            if not feat.qualifier:
+                continue
+            weight = scores.get(feat.name, FNUL) * weights.get(feat.name, FNUL)
+            score += weight
+        return score

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/multi.py ADDED Viewed

@@ -0,0 +1,21 @@
+from followthemoney.proxy import E
+from followthemoney.types import registry
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.compare.util import extract_numbers
+from nomenklatura.matching.util import type_pair, has_schema
+def numbers_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Find numbers in names and addresses and penalise different numbers."""
+    if has_schema(query, result, "Address"):
+        qv, rv = type_pair(query, result, registry.address)
+    else:
+        qv, rv = type_pair(query, result, registry.name)
+    qvn = extract_numbers(qv)
+    rvn = extract_numbers(rv)
+    base = min(len(qvn), len(rvn))
+    mismatch = len(qvn.difference(rvn))
+    # print("numbers_mismatch", mismatch, base, qvn, rvn)
+    score = float(mismatch) / float(max(1, base))
+    return FtResult(score=score, detail="Mismatching numbers: %s" % mismatch)

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/phonetic.py ADDED Viewed

@@ -0,0 +1,142 @@
+from functools import cached_property
+from typing import List, Optional
+from itertools import product
+from normality import ascii_text
+from followthemoney.proxy import E
+from followthemoney.types import registry
+from rigour.text.scripts import can_latinize
+from rigour.text.distance import is_levenshtein_plausible
+from rigour.text.phonetics import metaphone, soundex
+from rigour.names import tokenize_name
+from rigour.util import list_intersection
+from nomenklatura.matching.util import type_pair, has_schema
+from nomenklatura.matching.compat import fingerprint_name, name_words
+class NameTokenPhonetic:
+    def __init__(self, token: str):
+        self.token = token
+        self.ascii = ascii_text(token) if can_latinize(token) else None
+    @cached_property
+    def metaphone(self) -> Optional[str]:
+        if self.ascii is not None:
+            phoneme = metaphone(self.ascii)
+            if len(phoneme) >= 3:
+                return phoneme
+        return None
+    # def __repr__(self) -> str:
+    #     return f"<NameTokenPhonetic {self.token!r}, {self.ascii!r}, {self.metaphone!r}>"
+    @classmethod
+    def from_name(cls, name: str) -> List["NameTokenPhonetic"]:
+        tokens = tokenize_name(name.lower(), token_min_length=2)
+        return [cls(token) for token in tokens]
+def metaphone_token(token: str) -> str:
+    if token.isalpha() and len(token) > 1:
+        out = metaphone(token)
+        # doesn't handle non-ascii characters
+        if len(out) >= 3:
+            return out
+    return token.upper()
+def soundex_token(token: str) -> str:
+    if token.isalpha() and len(token) > 1:
+        out = soundex(token)
+        # doesn't handle non-ascii characters
+        if len(out):
+            return out
+    return token.upper()
+def compare_parts_phonetic(left: NameTokenPhonetic, right: NameTokenPhonetic) -> bool:
+    if left.metaphone is None or right.metaphone is None:
+        return left.ascii == right.ascii
+    if (
+        left.metaphone == right.metaphone
+        and left.ascii is not None
+        and right.ascii is not None
+    ):
+        # Secondary check for Levenshtein distance:
+        if is_levenshtein_plausible(left.ascii, right.ascii):
+            return True
+    return False
+def _clean_phonetic_entity(original: str) -> Optional[str]:
+    """Normalize a legal entity name without transliteration."""
+    if not can_latinize(original):
+        return None
+    return fingerprint_name(original)
+def _token_names_compare(
+    query_names: List[List[str]], result_names: List[List[str]]
+) -> float:
+    score = 0.0
+    for q, r in product(query_names, result_names):
+        # length = max(2.0, (len(q) + len(r)) / 2.0)
+        length = max(2.0, len(q))
+        combo = len(list_intersection(q, r)) / float(length)
+        score = max(score, combo)
+    return score
+def person_name_phonetic_match(query: E, result: E) -> float:
+    """Two persons have similar names, using a phonetic algorithm."""
+    if not has_schema(query, result, "Person"):
+        return 0.0
+    query_names_, result_names_ = type_pair(query, result, registry.name)
+    query_parts = [NameTokenPhonetic.from_name(n) for n in query_names_]
+    result_parts = [NameTokenPhonetic.from_name(n) for n in result_names_]
+    score = 0.0
+    for q, r in product(query_parts, result_parts):
+        if len(q) == 0:
+            continue
+        matches = list(r)
+        matched = 0
+        for part in q:
+            for other in matches:
+                if compare_parts_phonetic(part, other):
+                    matches.remove(other)
+                    matched += 1
+                    break
+        score = max(score, matched / float(len(q)))
+    return score
+def _metaphone_tokens(token: str) -> List[str]:
+    words: List[str] = []
+    for word in name_words(_clean_phonetic_entity(token), min_length=2):
+        words.append(metaphone_token(word))
+    return words
+def name_metaphone_match(query: E, result: E) -> float:
+    """Two entities (person and non-person) have similar names, using the metaphone
+    algorithm."""
+    query_names_, result_names_ = type_pair(query, result, registry.name)
+    query_names = [_metaphone_tokens(n) for n in query_names_]
+    result_names = [_metaphone_tokens(n) for n in result_names_]
+    return _token_names_compare(query_names, result_names)
+def _soundex_tokens(token: str) -> List[str]:
+    words: List[str] = []
+    for word in name_words(_clean_phonetic_entity(token), min_length=2):
+        words.append(soundex_token(word))
+    return words
+def name_soundex_match(query: E, result: E) -> float:
+    """Two entities (person and non-person) have similar names, using the soundex
+    algorithm."""
+    query_names_, result_names_ = type_pair(query, result, registry.name)
+    query_names = [_soundex_tokens(n) for n in query_names_]
+    result_names = [_soundex_tokens(n) for n in result_names_]
+    return _token_names_compare(query_names, result_names)

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/identifiers.py ADDED Viewed

@@ -0,0 +1,104 @@
+from itertools import product
+from rigour.ids import LEI, ISIN, INN, OGRN, IMO, BIC
+from rigour.ids import StrictFormat
+from rigour.text.distance import levenshtein
+from followthemoney import E, registry
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import has_schema, type_pair
+from nomenklatura.matching.compare.util import clean_map, CleanFunc
+def _id_prop_match(
+    query: E,
+    result: E,
+    prop_name: str,
+    clean: CleanFunc = None,
+) -> bool:
+    """Check if a specific property identifier is shared by two entities."""
+    prop = query.schema.get(prop_name)
+    if prop is None:
+        return False
+    lv = clean_map(query.get(prop), clean=clean)
+    if not len(lv):
+        return False
+    rv_ = result.get_type_values(prop.type, matchable=True)
+    rv = clean_map(rv_, clean=clean)
+    common = lv.intersection(rv)
+    return len(common) > 0
+def _bidi_id_prop_match(
+    query: E,
+    result: E,
+    prop_name: str,
+    clean: CleanFunc = None,
+) -> FtResult:
+    """Check if a specific property identifier is shared by two entities."""
+    if _id_prop_match(query, result, prop_name, clean=clean):
+        return FtResult(score=1.0, detail="Property match: %r" % prop_name)
+    if _id_prop_match(result, query, prop_name, clean=clean):
+        return FtResult(score=1.0, detail="Property match: %r" % prop_name)
+    return FtResult(score=0.0, detail="No match: %r" % prop_name)
+def lei_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same Legal Entity Identifier."""
+    return _bidi_id_prop_match(query, result, "leiCode", LEI.normalize)
+def bic_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same SWIFT BIC."""
+    return _bidi_id_prop_match(query, result, "swiftBic", BIC.normalize)
+def ogrn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same Russian company registration (OGRN) code."""
+    return _bidi_id_prop_match(query, result, "ogrnCode", OGRN.normalize)
+def inn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same Russian tax identifier (INN)."""
+    return _bidi_id_prop_match(query, result, "innCode", INN.normalize)
+def isin_security_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two securities have the same ISIN."""
+    if not has_schema(query, result, "Security"):
+        return FtResult(score=0.0, detail="None of the entities is a security")
+    return _bidi_id_prop_match(query, result, "isin", ISIN.normalize)
+def vessel_imo_mmsi_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two vessels have the same IMO or MMSI identifier."""
+    imo_res = _bidi_id_prop_match(query, result, "imoNumber", IMO.normalize)
+    if imo_res.score > 0.0:
+        return imo_res
+    return _bidi_id_prop_match(query, result, "mmsi")
+def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two companies or organizations have different tax identifiers or registration
+    numbers."""
+    if not has_schema(query, result, "Organization"):
+        return FtResult(score=0.0, detail=None)
+    query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
+    query_ids = clean_map(query_ids_, StrictFormat.normalize)
+    result_ids = clean_map(result_ids_, StrictFormat.normalize)
+    if not len(query_ids) or not len(result_ids):
+        return FtResult(score=0.0, detail=None)
+    common = query_ids.intersection(result_ids)
+    if len(common) > 0:
+        return FtResult(score=0.0, detail=None)
+    max_ratio = 0.0
+    for query_id, result_id in product(query_ids, result_ids):
+        distance = levenshtein(query_id, result_id)
+        max_len = max(len(query_id), len(result_id))
+        ratio = 1.0 - (distance / float(max_len))
+        if ratio > 0.7:
+            max_ratio = max(max_ratio, ratio)
+    detail = "Mismatched identifiers: %s vs %s" % (
+        ", ".join(query_ids),
+        ", ".join(result_ids),
+    )
+    return FtResult(score=1 - max_ratio, detail=detail)

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/model.py ADDED Viewed

@@ -0,0 +1,105 @@
+from typing import Dict, List
+from nomenklatura.matching.logic_v1.phonetic import name_soundex_match, person_name_phonetic_match
+from nomenklatura.matching.logic_v4.phonetic import name_metaphone_match
+from nomenklatura.matching.types import Feature, FtResult, HeuristicAlgorithm
+from nomenklatura.matching.types import ConfigVar, ConfigVarType
+from nomenklatura.matching.compare.countries import country_mismatch
+from nomenklatura.matching.compare.gender import gender_mismatch
+from nomenklatura.matching.compare.identifiers import crypto_wallet_address
+from nomenklatura.matching.compare.identifiers import identifier_match
+from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
+from nomenklatura.matching.compare.names import weak_alias_match
+from nomenklatura.matching.compare.addresses import address_entity_match
+from nomenklatura.matching.compare.addresses import address_prop_match
+from nomenklatura.matching.logic_v2.names.match import name_match_levenshtein
+from nomenklatura.matching.logic_v2.identifiers import bic_code_match
+from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
+from nomenklatura.matching.logic_v2.identifiers import isin_security_match
+from nomenklatura.matching.logic_v2.identifiers import lei_code_match
+from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
+from nomenklatura.matching.logic_v2.identifiers import uei_code_match
+from nomenklatura.matching.logic_v2.identifiers import npi_code_match
+from nomenklatura.matching.util import FNUL
+class LogicV4(HeuristicAlgorithm):
+    """A rule-based matching system that generates a set of basic scores via
+    name and identifier-based matching, and then qualifies that score using
+    supporting or contradicting features of the two entities. Version 4 uses
+    the same set of features as version 3, but adds phonetic name matching
+    as an additional name matching strategy. This new name matching function
+    uses metaphone and soundex algorithms for phonetic name matching."""
+    NAME = "logic-v4"
+    features = [
+        Feature(func=name_match_levenshtein, weight=1.0),
+        Feature(func=FtResult.wrap(person_name_phonetic_match), weight=0.9),
+        # These are there so they can be enabled using custom weights:
+        Feature(func=FtResult.wrap(name_metaphone_match), weight=FNUL),
+        Feature(func=FtResult.wrap(name_soundex_match), weight=FNUL),
+        Feature(func=address_entity_match, weight=0.98),
+        Feature(func=crypto_wallet_address, weight=0.98),
+        Feature(func=isin_security_match, weight=0.98),
+        Feature(func=lei_code_match, weight=0.95),
+        Feature(func=ogrn_code_match, weight=0.95),
+        Feature(func=vessel_imo_mmsi_match, weight=0.95),
+        Feature(func=inn_code_match, weight=0.95),
+        Feature(func=bic_code_match, weight=0.95),
+        Feature(func=uei_code_match, weight=0.95),
+        Feature(func=npi_code_match, weight=0.95),
+        Feature(func=identifier_match, weight=0.85),
+        Feature(func=weak_alias_match, weight=0.8),
+        Feature(func=address_prop_match, weight=0.2, qualifier=True),
+        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
+        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
+        Feature(func=dob_day_disjoint, weight=-0.25, qualifier=True),
+        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
+    ]
+    CONFIG = {
+        "nm_number_mismatch": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Penalty for mismatching numbers in object or company names.",
+            default=0.3,
+        ),
+        "nm_extra_query_name": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Weight for name parts in the query not matched to the result.",
+            default=0.8,
+        ),
+        "nm_extra_result_name": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Weight for name parts in the result not matched to the query.",
+            default=0.2,
+        ),
+        "nm_family_name_weight": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Extra weight multiplier for family name in person matches (John Smith vs. John Gruber is clearly distinct).",
+            default=1.3,
+        ),
+        "nm_fuzzy_cutoff_factor": ConfigVar(
+            type=ConfigVarType.FLOAT,
+            description="Extra factor for when a fuzzy match is triggered in name matching. "
+            "Below a certain threshold, a fuzzy match is considered as a non-match (score = 0.0). "
+            "Adjusting this multiplier will raise this threshold, making a fuzzy match trigger more leniently.",
+            default=1.0,
+        ),
+    }
+    @classmethod
+    def compute_score(
+        cls, scores: Dict[str, float], weights: Dict[str, float]
+    ) -> float:
+        mains: List[float] = []
+        for feat in cls.features:
+            if feat.qualifier:
+                continue
+            weight = scores.get(feat.name, FNUL) * weights.get(feat.name, FNUL)
+            mains.append(weight)
+        score = max(mains)
+        for feat in cls.features:
+            if not feat.qualifier:
+                continue
+            weight = scores.get(feat.name, FNUL) * weights.get(feat.name, FNUL)
+            score += weight
+        return score

nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/multi.py ADDED Viewed

@@ -0,0 +1,21 @@
+from followthemoney.proxy import E
+from followthemoney.types import registry
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.compare.util import extract_numbers
+from nomenklatura.matching.util import type_pair, has_schema
+def numbers_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Find numbers in names and addresses and penalise different numbers."""
+    if has_schema(query, result, "Address"):
+        qv, rv = type_pair(query, result, registry.address)
+    else:
+        qv, rv = type_pair(query, result, registry.name)
+    qvn = extract_numbers(qv)
+    rvn = extract_numbers(rv)
+    base = min(len(qvn), len(rvn))
+    mismatch = len(qvn.difference(rvn))
+    # print("numbers_mismatch", mismatch, base, qvn, rvn)
+    score = float(mismatch) / float(max(1, base))
+    return FtResult(score=score, detail="Mismatching numbers: %s" % mismatch)

nomenklatura-mpt 4.1.10__tar.gz → 4.1.12__tar.gz

nomenklatura-mpt 4.1.10__tar.gz → 4.1.12__tar.gz