nomenklatura_mpt-4.1.9-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/index/tokenizer.py
ADDED
@@ -0,0 +1,92 @@
+from typing import Generic, Generator, Optional, Tuple, Set
+from normality import WS, category_replace, ascii_text
+from rigour.ids import StrictFormat
+from rigour.names import tokenize_name
+from rigour.names import remove_person_prefixes
+from rigour.names.org_types import replace_org_types_display
+from followthemoney import registry, Property, DS, SE
+
+
+WORD_FIELD = "word"
+NAME_PART_FIELD = "namepart"
+SKIP_FULL = (
+    # registry.name,
+    registry.address,
+    registry.text,
+    registry.string,
+    registry.number,
+    registry.json,
+)
+TEXT_TYPES = (
+    registry.text,
+    registry.string,
+    registry.address,
+    registry.identifier,
+    registry.name,
+)
+
+
+def normalize_name(name: Optional[str]) -> Optional[str]:
+    """Normalize a name by lower-casing, tokenizing and transliterating it to ASCII."""
+    if name is None:
+        return None
+    name = name.lower()
+    name = " ".join(tokenize_name(name))
+    name = ascii_text(name)
+    if len(name) < 2:
+        return None
+    return name
+
+
+class Tokenizer(Generic[DS, SE]):
+    def value(
+        self, prop: Property, value: str
+    ) -> Generator[Tuple[str, str], None, None]:
+        """Perform type-specific token generation for a property value."""
+        type = prop.type
+        if not prop.matchable:
+            return
+        if type in (registry.url, registry.topic, registry.entity):
+            return
+        if type not in SKIP_FULL:
+            token_value = value[:100].lower()
+            yield type.name, token_value
+        if type == registry.date:
+            if len(value) > 4:
+                yield type.name, value[:4]
+            yield type.name, value[:10]
+            return
+        if type == registry.name:
+            name_parts: Set[str] = set()
+            # this needs to happen before the replacements
+            text = remove_person_prefixes(value)
+            for token in tokenize_name(text.lower(), token_min_length=3):
+                name_parts.add(token)
+            # Super hard-core string scrubbing
+            cleaned = normalize_name(text)
+            if cleaned is not None:
+                cleaned = replace_org_types_display(cleaned, normalizer=normalize_name)
+                yield type.name, cleaned
+                for token in cleaned.split(WS):
+                    name_parts.add(token)
+            for part in name_parts:
+                if len(part) > 2 and len(part) < 30:
+                    yield NAME_PART_FIELD, part
+            return
+        if type == registry.identifier:
+            clean_id = StrictFormat.normalize(value)
+            if clean_id is not None:
+                yield type.name, clean_id
+            return
+        if type in TEXT_TYPES:
+            text = value.lower()
+            replaced = category_replace(text)
+            for word in replaced.split(WS):
+                if len(word) >= 3:
+                    yield WORD_FIELD, word
+
+    def entity(self, entity: SE) -> Generator[Tuple[str, str], None, None]:
+        # yield f"d:{entity.dataset.name}", 0.0
+        for prop, value in entity.itervalues():
+            for field, token in self.value(prop, value):
+                yield field, token
nomenklatura/judgement.py
ADDED
@@ -0,0 +1,21 @@
+from enum import Enum
+
+
+class Judgement(Enum):
+    """A judgement of whether two entities are the same."""
+
+    POSITIVE = "positive"
+    NEGATIVE = "negative"
+    UNSURE = "unsure"
+    NO_JUDGEMENT = "no_judgement"
+
+    def __add__(self, other: "Judgement") -> "Judgement":
+        pair = {self, other}
+        if pair == {Judgement.POSITIVE}:
+            return Judgement.POSITIVE
+        elif pair == {Judgement.POSITIVE, Judgement.NEGATIVE}:
+            return Judgement.NEGATIVE
+        return Judgement.UNSURE
+
+    def to_dict(self) -> str:
+        return str(self.value)
nomenklatura/kv.py
ADDED
@@ -0,0 +1,40 @@
+import redis
+import logging
+from typing import Union
+from functools import cache
+from redis.client import Redis
+from fakeredis import FakeStrictRedis
+
+from rigour.env import ENCODING
+from nomenklatura import settings
+
+log = logging.getLogger(__name__)
+
+
+@cache
+def get_redis() -> "Redis[bytes]":
+    """Return a Redis connection configured from the environment."""
+    if settings.TESTING or not len(settings.REDIS_URL.strip()):
+        log.info("Using in-memory key-value store...")
+        return FakeStrictRedis(decode_responses=False)
+    db = redis.from_url(settings.REDIS_URL, decode_responses=False)
+    # for kvrocks:
+    if len(db.config_get("redis-cursor-compatible")):
+        db.config_set("redis-cursor-compatible", "yes")
+    return db
+
+
+def close_redis() -> None:
+    """Close the Redis connection."""
+    get_redis().close()
+    get_redis.cache_clear()
+
+
+def b(s: str) -> bytes:
+    """Encode a string to bytes."""
+    return s.encode(ENCODING)
+
+
+def bv(s: Union[bytes, str, int, float]) -> bytes:
+    """Pass a Redis value through as bytes to satisfy the type checker."""
+    return s  # type: ignore
nomenklatura/matching/__init__.py
ADDED
@@ -0,0 +1,47 @@
+from typing import List, Type, Optional
+from nomenklatura.matching.regression_v1.model import RegressionV1
+from nomenklatura.matching.svm_v1.model import SVMV1
+from nomenklatura.matching.regression_v1.train import train_matcher as train_v1_matcher
+from nomenklatura.matching.svm_v1.train import train_matcher as train_svm_matcher
+from nomenklatura.matching.name_based import NameMatcher, NameQualifiedMatcher
+from nomenklatura.matching.erun.model import EntityResolveRegression
+from nomenklatura.matching.erun.train import train_matcher as train_erun_matcher
+from nomenklatura.matching.logic_v1.model import LogicV1
+from nomenklatura.matching.logic_v2.model import LogicV2
+from nomenklatura.matching.types import ScoringAlgorithm, ScoringConfig
+
+ALGORITHMS: List[Type[ScoringAlgorithm]] = [
+    LogicV1,
+    LogicV2,
+    NameMatcher,
+    NameQualifiedMatcher,
+    RegressionV1,
+    EntityResolveRegression,
+    SVMV1,
+]
+
+DefaultAlgorithm = RegressionV1
+
+
+def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
+    """Return the scoring algorithm class with the given name."""
+    for algorithm in ALGORITHMS:
+        if algorithm.NAME == name:
+            return algorithm
+    return None
+
+
+__all__ = [
+    "RegressionV1",
+    "EntityResolveRegression",
+    "train_v1_matcher",
+    "train_svm_matcher",
+    "train_erun_matcher",
+    "DefaultAlgorithm",
+    "ScoringAlgorithm",
+    "NameMatcher",
+    "NameQualifiedMatcher",
+    "ScoringConfig",
+    "LogicV1",
+    "LogicV2",
+]
nomenklatura/matching/bench.py
ADDED
@@ -0,0 +1,32 @@
+import logging
+import datetime
+from timeit import timeit
+from itertools import cycle
+from followthemoney.util import PathLike
+
+from nomenklatura.matching import get_algorithm
+from nomenklatura.matching.pairs import read_pairs
+from nomenklatura.matching.types import ScoringConfig
+
+
+log = logging.getLogger(__name__)
+
+
+def bench_matcher(name: str, pairs_file: PathLike, number: int) -> None:
+    config = ScoringConfig.defaults()
+    log.info("Loading pairs from %s", pairs_file)
+    pairs = list(read_pairs(pairs_file))
+    log.info("Read %d pairs", len(pairs))
+    matcher = get_algorithm(name)
+    if matcher is None:
+        raise ValueError("No matcher named %s" % name)
+    log.info("Loaded %s", matcher.NAME)
+    infinite_pairs = cycle(pairs)
+
+    def compare_one_pair() -> None:
+        pair = next(infinite_pairs)
+        matcher.compare(pair.left, pair.right, config)
+
+    log.info("Running benchmark for %d iterations", number)
+    seconds = timeit(compare_one_pair, number=number)
+    log.info("Total time %s", datetime.timedelta(seconds=seconds))
nomenklatura/matching/compare/__init__.py
File without changes
nomenklatura/matching/compare/addresses.py
ADDED
@@ -0,0 +1,71 @@
+from functools import lru_cache
+from typing import List, Set
+from followthemoney.proxy import E
+from followthemoney.types import registry
+from itertools import product
+from rigour.text import levenshtein_similarity
+from rigour.addresses import normalize_address, remove_address_keywords
+
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import has_schema
+
+
+@lru_cache(maxsize=128)
+def _normalize_address(addr: str) -> Set[str]:
+    """Normalize an address string into tokens."""
+    norm = normalize_address(addr, latinize=True)
+    if norm is None:
+        return set()
+    norm = remove_address_keywords(norm, latinize=True)
+    if norm is None:
+        return set()
+    return set([n for n in norm.split() if len(n) > 0])
+
+
+def _address_match(query_addrs: List[str], result_addrs: List[str]) -> FtResult:
+    """Text similarity between addresses."""
+    if len(query_addrs) == 0 or len(result_addrs) == 0:
+        return FtResult(score=0.0, detail="No addresses provided")
+    max_result = FtResult(score=0.0, detail=None)
+    query_norms = [_normalize_address(addr) for addr in query_addrs]
+    result_norms = [_normalize_address(addr) for addr in result_addrs]
+    for query_tokens, result_tokens in product(query_norms, result_norms):
+        if len(query_tokens) == 0 or len(result_tokens) == 0:
+            continue
+        # pick out tokens that are in both sets and treat those as safe gains
+        overlap = query_tokens.intersection(result_tokens)
+        if len(overlap) == len(query_tokens) or len(overlap) == len(result_tokens):
+            detail = f"Address matches subset: {' '.join(overlap)}"
+            return FtResult(score=1.0, detail=detail)
+
+        # sort the address tokens alphabetically to address different orderings
+        query_rem = sorted([t for t in query_tokens if t not in overlap])
+        query_fuzzy = " ".join(query_rem)
+        result_rem = sorted([t for t in result_tokens if t not in overlap])
+        result_fuzzy = " ".join(result_rem)
+        fuzzy_len = max(len(query_fuzzy), len(result_fuzzy))
+        score = levenshtein_similarity(query_fuzzy, result_fuzzy, max_edits=fuzzy_len)
+
+        # combine the scores from overlap and levenshtein
+        rem_len = max(len(query_rem), len(result_rem))
+        score = (len(overlap) + (rem_len * score)) / (rem_len + len(overlap))
+        if score > max_result.score:
+            detail = f"Matched addresses: {query_fuzzy} <-> {result_fuzzy}"
+            max_result = FtResult(score=score, detail=detail)
+    return max_result
+
+
+def address_entity_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two address entities relate to similar addresses."""
+    if not has_schema(query, result, "Address"):
+        return FtResult(score=0.0, detail=None)
+    return _address_match(query.get("full"), result.get("full"))
+
+
+def address_prop_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have similar stated addresses."""
+    if has_schema(query, result, "Address"):
+        return FtResult(score=0.0, detail=None)
+    query_addrs = query.get_type_values(registry.address, matchable=True)
+    result_addrs = result.get_type_values(registry.address, matchable=True)
+    return _address_match(query_addrs, result_addrs)
nomenklatura/matching/compare/countries.py
ADDED
@@ -0,0 +1,15 @@
+from followthemoney.proxy import E
+from followthemoney.types import registry
+
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import type_pair
+
+
+def country_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Both entities are linked to different countries."""
+    qv, rv = type_pair(query, result, registry.country)
+    if len(qv) > 0 and len(rv) > 0:
+        if len(set(qv).intersection(rv)) == 0:
+            detail = f"Different countries: {qv} / {rv}"
+            return FtResult(score=1.0, detail=detail)
+    return FtResult(score=0.0, detail=None)
nomenklatura/matching/compare/dates.py
ADDED
@@ -0,0 +1,83 @@
+from typing import Iterable, Set
+from prefixdate import Precision
+from followthemoney.proxy import E
+
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.compare.util import has_overlap
+from nomenklatura.matching.util import props_pair
+
+
+def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
+    dates = set()
+    for value in values:
+        if len(value) >= precision.value:
+            dates.add(value[: precision.value])
+    return dates
+
+
+def _flip_day_month(value: str) -> str:
+    # This is such a common mistake that we want to accommodate it.
+    year, month, day = value.split("-", 2)
+    return f"{year}-{day}-{month}"
+
+
+def dob_matches(query: E, result: E) -> float:
+    """The birth date of the two entities is the same."""
+    query_dates, result_dates = props_pair(query, result, ["birthDate"])
+    if len(query_dates) == 0 or len(result_dates) == 0:
+        return 0.0
+    result_days = _dates_precision(result_dates, Precision.DAY)
+    query_days = _dates_precision(query_dates, Precision.DAY)
+    if has_overlap(query_days, result_days):
+        return 1.0
+    query_flipped = [_flip_day_month(d) for d in query_days]
+    if has_overlap(query_flipped, result_days):
+        return 0.5
+    return 0.0
+
+
+def dob_year_matches(query: E, result: E) -> float:
+    """The birth year of the two entities is the same."""
+    query_dates, result_dates = props_pair(query, result, ["birthDate"])
+    query_years = _dates_precision(query_dates, Precision.YEAR)
+    result_years = _dates_precision(result_dates, Precision.YEAR)
+    if has_overlap(query_years, result_years):
+        return 1.0
+    return 0.0
+
+
+def dob_day_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """The birth date of the two entities is not the same."""
+    query_dates, result_dates = props_pair(query, result, ["birthDate"])
+    if len(query_dates) == 0 or len(result_dates) == 0:
+        return FtResult(score=0.0, detail="No birth dates provided")
+    result_days = _dates_precision(result_dates, Precision.DAY)
+    query_days = _dates_precision(query_dates, Precision.DAY)
+    if len(result_days) == 0 or len(query_days) == 0:
+        return FtResult(score=0.0, detail="No birth days provided")
+    if has_overlap(query_days, result_days):
+        match = ", ".join(query_days.intersection(result_days))
+        detail = f"Birth day match: {match}"
+        return FtResult(score=0.0, detail=detail)
+    query_flipped = [_flip_day_month(d) for d in query_days]
+    if has_overlap(query_flipped, result_days):
+        match = ", ".join(result_days.intersection(query_flipped))
+        detail = f"Birth day mis-match (flipped): {match}"
+        return FtResult(score=0.5, detail=detail)
+    detail = f"Birth day mis-match: {', '.join(query_days)} vs {', '.join(result_days)}"
+    return FtResult(score=1.0, detail=detail)
+
+
+def dob_year_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """The birth year of the two entities is not the same."""
+    query_dates, result_dates = props_pair(query, result, ["birthDate"])
+    query_years = _dates_precision(query_dates, Precision.YEAR)
+    result_years = _dates_precision(result_dates, Precision.YEAR)
+    if len(query_years) == 0 or len(result_years) == 0:
+        return FtResult(score=0.0, detail="No birth years provided")
+    common = query_years.intersection(result_years)
+    if len(common) > 0:
+        detail = f"Birth year match: {', '.join(common)}"
+        return FtResult(score=0.0, detail=detail)
+    detail = f"Birth years: {', '.join(query_years)} vs {', '.join(result_years)}"
+    return FtResult(score=1.0, detail=detail)
nomenklatura/matching/compare/gender.py
ADDED
@@ -0,0 +1,15 @@
+from followthemoney.proxy import E
+
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import props_pair
+from nomenklatura.matching.util import FNUL
+
+
+def gender_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Both entities have a different gender associated with them."""
+    qv, rv = props_pair(query, result, ["gender"])
+    if len(qv) > 0 and len(rv) > 0:
+        if len(set(qv).intersection(rv)) == 0:
+            detail = f"Different genders: {qv} / {rv}"
+            return FtResult(score=1.0, detail=detail)
+    return FtResult(score=FNUL, detail=None)
nomenklatura/matching/compare/identifiers.py
ADDED
@@ -0,0 +1,30 @@
+from rigour.ids import StrictFormat
+from followthemoney import E, registry
+
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import type_pair, props_pair, has_schema
+from nomenklatura.matching.compare.util import clean_map
+from nomenklatura.matching.util import FNUL
+
+
+def crypto_wallet_address(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two cryptocurrency wallets have the same public key."""
+    if not has_schema(query, result, "CryptoWallet"):
+        return FtResult(score=FNUL, detail=None)
+    lv, rv = props_pair(query, result, ["publicKey"])
+    for key in lv.intersection(rv):
+        if len(key) > 10:
+            return FtResult(score=1.0, detail="Matched address: %s" % key)
+    return FtResult(score=FNUL, detail=None)
+
+
+def identifier_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same tax or registration identifier."""
+    query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
+    query_ids = clean_map(query_ids_, StrictFormat.normalize)
+    result_ids = clean_map(result_ids_, StrictFormat.normalize)
+    common = query_ids.intersection(result_ids)
+    if len(common) > 0:
+        detail = "Matched identifiers: %s" % ", ".join(common)
+        return FtResult(score=1.0, detail=detail)
+    return FtResult(score=FNUL, detail=None)
nomenklatura/matching/compare/names.py
ADDED
@@ -0,0 +1,157 @@
+from typing import List, Dict, Tuple
+from itertools import product
+from followthemoney.proxy import E
+from followthemoney.types import registry
+from rigour.text.distance import levenshtein_similarity
+from rigour.text.distance import jaro_winkler, is_levenshtein_plausible
+from nomenklatura.matching.types import FtResult, ScoringConfig
+from nomenklatura.matching.util import type_pair, props_pair, has_schema
+from nomenklatura.matching.compare.util import is_disjoint, clean_map
+from nomenklatura.matching.compat import clean_name_ascii, clean_name_light
+from nomenklatura.matching.compat import fingerprint_name, name_words, names_word_list
+from nomenklatura.matching.util import FNUL
+
+
+def _name_parts(name: str) -> List[str]:
+    return name_words(clean_name_ascii(name))
+
+
+def _align_name_parts(query: List[str], result: List[str]) -> float:
+    if len(query) == 0 or len(result) == 0:
+        return 0.0
+
+    scores: Dict[Tuple[str, str], float] = {}
+    # compute all pairwise scores for name parts:
+    for qn, rn in product(set(query), set(result)):
+        score = jaro_winkler(qn, rn)
+        if score > 0.0 and is_levenshtein_plausible(qn, rn):
+            scores[(qn, rn)] = score
+    pairs: List[Tuple[str, str]] = []
+    # original length of query:
+    length = len(query)
+    total_score = 1.0
+    # find the best pairing for each name part by score:
+    for (qn, rn), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
+        # one name part can only be used once, but can show up multiple times:
+        while qn in query and rn in result:
+            query.remove(qn)
+            result.remove(rn)
+            total_score = total_score * score
+            pairs.append((qn, rn))
+    # assume there should be at least a candidate for each query name part:
+    if len(pairs) < length:
+        return 0.0
+    # weakest evidence first to bias jaro-winkler for lower scores on imperfect matches:
+    aligned = pairs[::-1]
+    query_aligned = "".join(p[0] for p in aligned)
+    result_aligned = "".join(p[1] for p in aligned)
+    if not is_levenshtein_plausible(query_aligned, result_aligned):
+        return 0.0
+    # return an amped-up jaro-winkler score for the aligned name parts:
+    return total_score
+    # return jaro_winkler(query_aligned, result_aligned)
+
+
+def person_name_jaro_winkler(query: E, result: E) -> float:
+    """Compare two persons' names using the Jaro-Winkler string similarity algorithm."""
+    if not has_schema(query, result, "Person"):
+        return FNUL
+    query_names_, result_names_ = type_pair(query, result, registry.name)
+    query_names = [_name_parts(n) for n in query_names_]
+    result_names = [_name_parts(n) for n in result_names_]
+    score = 0.0
+    for qn, rn in product(query_names, result_names):
+        qns = "".join(qn)
+        rns = "".join(rn)
+        if is_levenshtein_plausible(qns, rns):
+            score = max(score, jaro_winkler(qns, rns) ** len(qns))
+        score = max(score, _align_name_parts(list(qn), list(rn)))
+    return score
+
+
+def name_fingerprint_levenshtein(query: E, result: E) -> float:
+    """Two non-person entities have similar fingerprinted names. This includes
+    simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
+    Damerau-Levenshtein string distance algorithm."""
+    if has_schema(query, result, "Person"):
+        return FNUL
+    query_names, result_names = type_pair(query, result, registry.name)
+    max_score = FNUL
+    for qn, rn in product(query_names, result_names):
+        score = levenshtein_similarity(qn, rn)
+        max_score = max(max_score, score)
+        qfp = fingerprint_name(qn)
+        rfp = fingerprint_name(rn)
+        if qfp is None or rfp is None:
+            continue
+        score = levenshtein_similarity(qfp.replace(" ", ""), rfp.replace(" ", ""))
+        max_score = max(max_score, score)
+        qtokens = name_words(qfp, min_length=2)
+        rtokens = name_words(rfp, min_length=2)
+        for part in name_words(clean_name_ascii(rfp), min_length=2):
+            if part not in rtokens:
+                rtokens.append(part)
+
+        scores: Dict[Tuple[str, str], float] = {}
+        # compute all pairwise scores for name parts:
+        for q, r in product(set(qtokens), set(rtokens)):
+            scores[(q, r)] = levenshtein_similarity(
+                q, r, max_edits=None, max_percent=1.0
+            )
+        aligned: List[Tuple[str, str, float]] = []
+        # find the best pairing for each name part by score:
+        for (q, r), score in sorted(
+            scores.items(), key=lambda i: (i[1], i[0]), reverse=True
+        ):
+            # one name part can only be used once, but can show up multiple times:
+            while q in qtokens and r in rtokens:
+                qtokens.remove(q)
+                rtokens.remove(r)
+                aligned.append((q, r, score))
+        # assume there should be at least a candidate for each query name part:
+        if len(qtokens):
+            continue
+        qaligned = "".join(p[0] for p in aligned)
+        raligned = "".join(p[1] for p in aligned)
+        score = levenshtein_similarity(qaligned, raligned)
+        max_score = max(max_score, score)
+    return max_score
+
+
+def name_literal_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """Two entities have the same name, without normalization applied to the name."""
+    query_names, result_names = type_pair(query, result, registry.name)
+    qnames = clean_map(query_names, clean_name_light)
+    rnames = clean_map(result_names, clean_name_light)
+    overlap = qnames.intersection(rnames)
+    if len(overlap) == 0:
+        return FtResult(score=FNUL, detail=None)
+    detail = f"Identical names: {', '.join(overlap)}"
+    return FtResult(score=1.0, detail=detail)
+
+
+def last_name_mismatch(query: E, result: E) -> float:
+    """The two persons have different last names."""
+    qv, rv = props_pair(query, result, ["lastName"])
+    qvt = names_word_list(qv, min_length=2)
+    rvt = names_word_list(rv, min_length=2)
+    # TODO: levenshtein
+    # for (qn, rn) in product(qvt, rvt):
+    #     similarity = levenshtein_similarity(qn, rn)
+    return 1.0 if is_disjoint(qvt, rvt) else FNUL
+
+
+def weak_alias_match(query: E, result: E, config: ScoringConfig) -> FtResult:
+    """The query name is exactly the same as a result's weak alias."""
+    # NOTE: This is unbalanced, i.e. it treats 'query' and 'result' differently.
+    # cf. https://ofac.treasury.gov/faqs/topic/1646
+    query_names = query.get_type_values(registry.name)
+    query_names.extend(query.get("weakAlias", quiet=True))
+    result_names = result.get("weakAlias", quiet=True)
+    qnames = clean_map(query_names, clean_name_light)
+    rnames = clean_map(result_names, clean_name_light)
+    overlap = qnames.intersection(rnames)
+    if len(overlap) == 0:
+        return FtResult(score=FNUL, detail=None)
+    detail = f"Matched weak alias: {', '.join(overlap)}"
+    return FtResult(score=1.0, detail=detail)
nomenklatura/matching/compare/util.py
ADDED
@@ -0,0 +1,51 @@
+import re
+from typing import List, Set, Union, Iterable, Callable, Optional
+
+CleanFunc = Optional[Callable[[str], Optional[str]]]
+FIND_NUM = re.compile(r"\d{1,}")
+
+
+def is_disjoint(
+    left: Union[Set[str], List[str]],
+    right: Union[Set[str], List[str]],
+) -> bool:
+    """Returns true if both sequences are non-empty but have no common values."""
+    if len(left) and len(right):
+        if set(left).isdisjoint(right):
+            return True
+    return False
+
+
+def has_overlap(
+    left: Union[Set[str], List[str]],
+    right: Union[Set[str], List[str]],
+) -> bool:
+    """Returns true if both sequences are non-empty and have common values."""
+    if not set(left).isdisjoint(right):
+        return True
+    return False
+
+
+def clean_map(
+    texts: Iterable[Optional[str]],
+    clean: CleanFunc = None,
+) -> Set[str]:
+    """Apply a cleaning function to a set of strings and only return non-empty ones."""
+    out: Set[str] = set()
+    for text in texts:
+        if text is None or len(text) == 0:
+            continue
+        if clean is not None:
+            text = clean(text)
+            if text is None or len(text) == 0:
+                continue
+        out.add(text)
+    return out
+
+
+def extract_numbers(values: List[str]) -> Set[str]:
+    """Extract all numbers from a list of strings."""
+    numbers: Set[str] = set()
+    for value in values:
+        numbers.update(FIND_NUM.findall(value))
+    return numbers