nomenklatura-mpt 4.1.10__tar.gz → 4.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/PKG-INFO +1 -1
  2. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/__init__.py +7 -0
  3. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/match.py +17 -0
  4. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/identifiers.py +104 -0
  5. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/model.py +99 -0
  6. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/multi.py +21 -0
  7. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3/phonetic.py +142 -0
  8. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/identifiers.py +104 -0
  9. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/model.py +105 -0
  10. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/multi.py +21 -0
  11. nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4/phonetic.py +142 -0
  12. nomenklatura_mpt-4.1.12/nomenklatura/publish/__init__.py +0 -0
  13. nomenklatura_mpt-4.1.12/nomenklatura/py.typed +0 -0
  14. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/pyproject.toml +1 -1
  15. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/.gitignore +0 -0
  16. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/LICENSE +0 -0
  17. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/README.md +0 -0
  18. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/__init__.py +0 -0
  19. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/cache.py +0 -0
  20. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/cli.py +0 -0
  21. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/conflicting_match.py +0 -0
  22. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/data/er-unstable.pkl +0 -0
  23. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/data/regression-v1.pkl +0 -0
  24. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/data/svm-v1.pkl +0 -0
  25. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/db.py +0 -0
  26. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/delta.py +0 -0
  27. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/__init__.py +0 -0
  28. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/aleph.py +0 -0
  29. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/common.py +0 -0
  30. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/nominatim.py +0 -0
  31. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/opencorporates.py +0 -0
  32. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/openfigi.py +0 -0
  33. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/permid.py +0 -0
  34. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/wikidata.py +0 -0
  35. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/enrich/yente.py +0 -0
  36. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/exceptions.py +0 -0
  37. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/index/__init__.py +0 -0
  38. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/index/common.py +0 -0
  39. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/index/entry.py +0 -0
  40. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/index/index.py +0 -0
  41. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/index/tokenizer.py +0 -0
  42. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/judgement.py +0 -0
  43. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/kv.py +0 -0
  44. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/bench.py +0 -0
  45. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/__init__.py +0 -0
  46. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/addresses.py +0 -0
  47. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/countries.py +0 -0
  48. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/dates.py +0 -0
  49. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/gender.py +0 -0
  50. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/identifiers.py +0 -0
  51. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/names.py +0 -0
  52. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compare/util.py +0 -0
  53. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/compat.py +0 -0
  54. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/__init__.py +0 -0
  55. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/countries.py +0 -0
  56. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/identifiers.py +0 -0
  57. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/misc.py +0 -0
  58. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/model.py +0 -0
  59. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/names.py +0 -0
  60. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/train.py +0 -0
  61. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/erun/util.py +0 -0
  62. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v1/__init__.py +0 -0
  63. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v1/identifiers.py +0 -0
  64. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v1/model.py +0 -0
  65. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v1/multi.py +0 -0
  66. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v1/phonetic.py +0 -0
  67. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/__init__.py +0 -0
  68. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/identifiers.py +0 -0
  69. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/model.py +0 -0
  70. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/__init__.py +0 -0
  71. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/analysis.py +0 -0
  72. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/distance.py +0 -0
  73. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/magic.py +0 -0
  74. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/pairing.py +0 -0
  75. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/logic_v2/names/util.py +0 -0
  76. {nomenklatura_mpt-4.1.10/nomenklatura/matching/regression_v1 → nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v3}/__init__.py +0 -0
  77. {nomenklatura_mpt-4.1.10/nomenklatura/publish → nomenklatura_mpt-4.1.12/nomenklatura/matching/logic_v4}/__init__.py +0 -0
  78. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/name_based/__init__.py +0 -0
  79. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/name_based/misc.py +0 -0
  80. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/name_based/model.py +0 -0
  81. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/name_based/names.py +0 -0
  82. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/pairs.py +0 -0
  83. /nomenklatura_mpt-4.1.10/nomenklatura/py.typed → /nomenklatura_mpt-4.1.12/nomenklatura/matching/regression_v1/__init__.py +0 -0
  84. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/regression_v1/misc.py +0 -0
  85. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/regression_v1/model.py +0 -0
  86. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/regression_v1/names.py +0 -0
  87. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/regression_v1/train.py +0 -0
  88. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/regression_v1/util.py +0 -0
  89. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/svm_v1/__init__.py +0 -0
  90. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/svm_v1/misc.py +0 -0
  91. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/svm_v1/model.py +0 -0
  92. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/svm_v1/names.py +0 -0
  93. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/svm_v1/train.py +0 -0
  94. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/svm_v1/util.py +0 -0
  95. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/types.py +0 -0
  96. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/matching/util.py +0 -0
  97. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/publish/dates.py +0 -0
  98. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/publish/edges.py +0 -0
  99. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/resolver/__init__.py +0 -0
  100. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/resolver/common.py +0 -0
  101. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/resolver/edge.py +0 -0
  102. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/resolver/identifier.py +0 -0
  103. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/resolver/linker.py +0 -0
  104. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/resolver/resolver.py +0 -0
  105. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/settings.py +0 -0
  106. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/__init__.py +0 -0
  107. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/base.py +0 -0
  108. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/level.py +0 -0
  109. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/memory.py +0 -0
  110. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/redis_.py +0 -0
  111. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/sql.py +0 -0
  112. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/util.py +0 -0
  113. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/store/versioned.py +0 -0
  114. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/tui/__init__.py +0 -0
  115. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/tui/app.py +0 -0
  116. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/tui/app.tcss +0 -0
  117. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/tui/comparison.py +0 -0
  118. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/tui/util.py +0 -0
  119. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/util.py +0 -0
  120. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/versions.py +0 -0
  121. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/__init__.py +0 -0
  122. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/client.py +0 -0
  123. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/lang.py +0 -0
  124. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/model.py +0 -0
  125. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/props.py +0 -0
  126. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/qualified.py +0 -0
  127. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/query.py +0 -0
  128. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/wikidata/value.py +0 -0
  129. {nomenklatura_mpt-4.1.10 → nomenklatura_mpt-4.1.12}/nomenklatura/xref.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nomenklatura_mpt
3
- Version: 4.1.10
3
+ Version: 4.1.12
4
4
  Summary: Make record linkages in followthemoney data.
5
5
  Project-URL: Documentation, https://github.com/opensanctions/nomenklatura/
6
6
  Project-URL: Repository, https://github.com/opensanctions/nomenklatura.git
@@ -8,11 +8,15 @@ from nomenklatura.matching.erun.model import EntityResolveRegression
8
8
  from nomenklatura.matching.erun.train import train_matcher as train_erun_matcher
9
9
  from nomenklatura.matching.logic_v1.model import LogicV1
10
10
  from nomenklatura.matching.logic_v2.model import LogicV2
11
+ from nomenklatura.matching.logic_v3.model import LogicV3
12
+ from nomenklatura.matching.logic_v4.model import LogicV4
11
13
  from nomenklatura.matching.types import ScoringAlgorithm, ScoringConfig
12
14
 
13
15
  ALGORITHMS: List[Type[ScoringAlgorithm]] = [
14
16
  LogicV1,
15
17
  LogicV2,
18
+ LogicV3,
19
+ LogicV4,
16
20
  NameMatcher,
17
21
  NameQualifiedMatcher,
18
22
  RegressionV1,
@@ -44,4 +48,7 @@ __all__ = [
44
48
  "ScoringConfig",
45
49
  "LogicV1",
46
50
  "LogicV2",
51
+ "LogicV3",
52
+ "LogicV4",
53
+ "SVMV1",
47
54
  ]
@@ -193,3 +193,20 @@ def name_match(query: E, result: E, config: ScoringConfig) -> FtResult:
193
193
  if best.detail is None:
194
194
  best.detail = "No names available for matching"
195
195
  return best
196
+
197
+
198
+
199
+
200
+ def name_match_levenshtein(query: E, result: E, config: ScoringConfig) -> FtResult:
201
+ """Match two entities by analyzing and comparing their names."""
202
+ schema = model.common_schema(query.schema, result.schema)
203
+ type_tag = schema_type_tag(schema)
204
+ best = FtResult(score=0.0, detail=None)
205
+ if type_tag == NameTypeTag.UNK:
206
+ # Name matching is not supported for entities that are not listed
207
+ # as a person, organization, or a thing.
208
+ best.detail = "Unsuited for name matching: %s" % schema.name
209
+ return best
210
+
211
+ return match_object_names(query, result, config)
212
+
@@ -0,0 +1,104 @@
1
+ from itertools import product
2
+ from rigour.ids import LEI, ISIN, INN, OGRN, IMO, BIC
3
+ from rigour.ids import StrictFormat
4
+ from rigour.text.distance import levenshtein
5
+ from followthemoney import E, registry
6
+
7
+ from nomenklatura.matching.types import FtResult, ScoringConfig
8
+ from nomenklatura.matching.util import has_schema, type_pair
9
+ from nomenklatura.matching.compare.util import clean_map, CleanFunc
10
+
11
+
12
+ def _id_prop_match(
13
+ query: E,
14
+ result: E,
15
+ prop_name: str,
16
+ clean: CleanFunc = None,
17
+ ) -> bool:
18
+ """Check if a specific property identifier is shared by two entities."""
19
+ prop = query.schema.get(prop_name)
20
+ if prop is None:
21
+ return False
22
+ lv = clean_map(query.get(prop), clean=clean)
23
+ if not len(lv):
24
+ return False
25
+ rv_ = result.get_type_values(prop.type, matchable=True)
26
+ rv = clean_map(rv_, clean=clean)
27
+ common = lv.intersection(rv)
28
+ return len(common) > 0
29
+
30
+
31
+ def _bidi_id_prop_match(
32
+ query: E,
33
+ result: E,
34
+ prop_name: str,
35
+ clean: CleanFunc = None,
36
+ ) -> FtResult:
37
+ """Check if a specific property identifier is shared by two entities."""
38
+ if _id_prop_match(query, result, prop_name, clean=clean):
39
+ return FtResult(score=1.0, detail="Property match: %r" % prop_name)
40
+ if _id_prop_match(result, query, prop_name, clean=clean):
41
+ return FtResult(score=1.0, detail="Property match: %r" % prop_name)
42
+ return FtResult(score=0.0, detail="No match: %r" % prop_name)
43
+
44
+
45
+ def lei_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
46
+ """Two entities have the same Legal Entity Identifier."""
47
+ return _bidi_id_prop_match(query, result, "leiCode", LEI.normalize)
48
+
49
+
50
+ def bic_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
51
+ """Two entities have the same SWIFT BIC."""
52
+ return _bidi_id_prop_match(query, result, "swiftBic", BIC.normalize)
53
+
54
+
55
+ def ogrn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
56
+ """Two entities have the same Russian company registration (OGRN) code."""
57
+ return _bidi_id_prop_match(query, result, "ogrnCode", OGRN.normalize)
58
+
59
+
60
+ def inn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
61
+ """Two entities have the same Russian tax identifier (INN)."""
62
+ return _bidi_id_prop_match(query, result, "innCode", INN.normalize)
63
+
64
+
65
+ def isin_security_match(query: E, result: E, config: ScoringConfig) -> FtResult:
66
+ """Two securities have the same ISIN."""
67
+ if not has_schema(query, result, "Security"):
68
+ return FtResult(score=0.0, detail="None of the entities is a security")
69
+ return _bidi_id_prop_match(query, result, "isin", ISIN.normalize)
70
+
71
+
72
+ def vessel_imo_mmsi_match(query: E, result: E, config: ScoringConfig) -> FtResult:
73
+ """Two vessels have the same IMO or MMSI identifier."""
74
+ imo_res = _bidi_id_prop_match(query, result, "imoNumber", IMO.normalize)
75
+ if imo_res.score > 0.0:
76
+ return imo_res
77
+ return _bidi_id_prop_match(query, result, "mmsi")
78
+
79
+
80
+ def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
81
+ """Two companies or organizations have different tax identifiers or registration
82
+ numbers."""
83
+ if not has_schema(query, result, "Organization"):
84
+ return FtResult(score=0.0, detail=None)
85
+ query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
86
+ query_ids = clean_map(query_ids_, StrictFormat.normalize)
87
+ result_ids = clean_map(result_ids_, StrictFormat.normalize)
88
+ if not len(query_ids) or not len(result_ids):
89
+ return FtResult(score=0.0, detail=None)
90
+ common = query_ids.intersection(result_ids)
91
+ if len(common) > 0:
92
+ return FtResult(score=0.0, detail=None)
93
+ max_ratio = 0.0
94
+ for query_id, result_id in product(query_ids, result_ids):
95
+ distance = levenshtein(query_id, result_id)
96
+ max_len = max(len(query_id), len(result_id))
97
+ ratio = 1.0 - (distance / float(max_len))
98
+ if ratio > 0.7:
99
+ max_ratio = max(max_ratio, ratio)
100
+ detail = "Mismatched identifiers: %s vs %s" % (
101
+ ", ".join(query_ids),
102
+ ", ".join(result_ids),
103
+ )
104
+ return FtResult(score=1 - max_ratio, detail=detail)
@@ -0,0 +1,99 @@
1
+ from typing import Dict, List
2
+
3
+ from nomenklatura.matching.types import Feature, HeuristicAlgorithm
4
+ from nomenklatura.matching.types import ConfigVar, ConfigVarType
5
+ from nomenklatura.matching.compare.countries import country_mismatch
6
+ from nomenklatura.matching.compare.gender import gender_mismatch
7
+ from nomenklatura.matching.compare.identifiers import crypto_wallet_address
8
+ from nomenklatura.matching.compare.identifiers import identifier_match
9
+ from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
10
+ from nomenklatura.matching.compare.names import weak_alias_match
11
+ from nomenklatura.matching.compare.addresses import address_entity_match
12
+ from nomenklatura.matching.compare.addresses import address_prop_match
13
+ from nomenklatura.matching.logic_v2.names.match import name_match_levenshtein
14
+ from nomenklatura.matching.logic_v2.identifiers import bic_code_match
15
+ from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
16
+ from nomenklatura.matching.logic_v2.identifiers import isin_security_match
17
+ from nomenklatura.matching.logic_v2.identifiers import lei_code_match
18
+ from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
19
+ from nomenklatura.matching.logic_v2.identifiers import uei_code_match
20
+ from nomenklatura.matching.logic_v2.identifiers import npi_code_match
21
+ from nomenklatura.matching.util import FNUL
22
+
23
+
24
+ class LogicV3(HeuristicAlgorithm):
25
+ """A rule-based matching system that generates a set of basic scores via
26
+ name and identifier-based matching, and then qualifies that score using
27
+ supporting or contradicting features of the two entities. Version 3 uses
28
+ the same set of features as version 2, but replaces the name_match feature
29
+ with a new implementation. This new name matching function uses strict levenshtein
30
+ for name matching"""
31
+
32
+ NAME = "logic-v3"
33
+ features = [
34
+ Feature(func=name_match_levenshtein, weight=1.0),
35
+ Feature(func=address_entity_match, weight=0.98),
36
+ Feature(func=crypto_wallet_address, weight=0.98),
37
+ Feature(func=isin_security_match, weight=0.98),
38
+ Feature(func=lei_code_match, weight=0.95),
39
+ Feature(func=ogrn_code_match, weight=0.95),
40
+ Feature(func=vessel_imo_mmsi_match, weight=0.95),
41
+ Feature(func=inn_code_match, weight=0.95),
42
+ Feature(func=bic_code_match, weight=0.95),
43
+ Feature(func=uei_code_match, weight=0.95),
44
+ Feature(func=npi_code_match, weight=0.95),
45
+ Feature(func=identifier_match, weight=0.85),
46
+ Feature(func=weak_alias_match, weight=0.8),
47
+ Feature(func=address_prop_match, weight=0.2, qualifier=True),
48
+ Feature(func=country_mismatch, weight=-0.2, qualifier=True),
49
+ Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
50
+ Feature(func=dob_day_disjoint, weight=-0.25, qualifier=True),
51
+ Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
52
+ ]
53
+ CONFIG = {
54
+ "nm_number_mismatch": ConfigVar(
55
+ type=ConfigVarType.FLOAT,
56
+ description="Penalty for mismatching numbers in object or company names.",
57
+ default=0.3,
58
+ ),
59
+ "nm_extra_query_name": ConfigVar(
60
+ type=ConfigVarType.FLOAT,
61
+ description="Weight for name parts in the query not matched to the result.",
62
+ default=0.8,
63
+ ),
64
+ "nm_extra_result_name": ConfigVar(
65
+ type=ConfigVarType.FLOAT,
66
+ description="Weight for name parts in the result not matched to the query.",
67
+ default=0.2,
68
+ ),
69
+ "nm_family_name_weight": ConfigVar(
70
+ type=ConfigVarType.FLOAT,
71
+ description="Extra weight multiplier for family name in person matches (John Smith vs. John Gruber is clearly distinct).",
72
+ default=1.3,
73
+ ),
74
+ "nm_fuzzy_cutoff_factor": ConfigVar(
75
+ type=ConfigVarType.FLOAT,
76
+ description="Extra factor for when a fuzzy match is triggered in name matching. "
77
+ "Below a certain threshold, a fuzzy match is considered as a non-match (score = 0.0). "
78
+ "Adjusting this multiplier will raise this threshold, making a fuzzy match trigger more leniently.",
79
+ default=1.0,
80
+ ),
81
+ }
82
+
83
+ @classmethod
84
+ def compute_score(
85
+ cls, scores: Dict[str, float], weights: Dict[str, float]
86
+ ) -> float:
87
+ mains: List[float] = []
88
+ for feat in cls.features:
89
+ if feat.qualifier:
90
+ continue
91
+ weight = scores.get(feat.name, FNUL) * weights.get(feat.name, FNUL)
92
+ mains.append(weight)
93
+ score = max(mains)
94
+ for feat in cls.features:
95
+ if not feat.qualifier:
96
+ continue
97
+ weight = scores.get(feat.name, FNUL) * weights.get(feat.name, FNUL)
98
+ score += weight
99
+ return score
@@ -0,0 +1,21 @@
1
+ from followthemoney.proxy import E
2
+ from followthemoney.types import registry
3
+
4
+ from nomenklatura.matching.types import FtResult, ScoringConfig
5
+ from nomenklatura.matching.compare.util import extract_numbers
6
+ from nomenklatura.matching.util import type_pair, has_schema
7
+
8
+
9
+ def numbers_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
10
+ """Find numbers in names and addresses and penalise different numbers."""
11
+ if has_schema(query, result, "Address"):
12
+ qv, rv = type_pair(query, result, registry.address)
13
+ else:
14
+ qv, rv = type_pair(query, result, registry.name)
15
+ qvn = extract_numbers(qv)
16
+ rvn = extract_numbers(rv)
17
+ base = min(len(qvn), len(rvn))
18
+ mismatch = len(qvn.difference(rvn))
19
+ # print("numbers_mismatch", mismatch, base, qvn, rvn)
20
+ score = float(mismatch) / float(max(1, base))
21
+ return FtResult(score=score, detail="Mismatching numbers: %s" % mismatch)
@@ -0,0 +1,142 @@
1
+ from functools import cached_property
2
+ from typing import List, Optional
3
+ from itertools import product
4
+ from normality import ascii_text
5
+ from followthemoney.proxy import E
6
+ from followthemoney.types import registry
7
+ from rigour.text.scripts import can_latinize
8
+ from rigour.text.distance import is_levenshtein_plausible
9
+ from rigour.text.phonetics import metaphone, soundex
10
+ from rigour.names import tokenize_name
11
+ from rigour.util import list_intersection
12
+
13
+ from nomenklatura.matching.util import type_pair, has_schema
14
+ from nomenklatura.matching.compat import fingerprint_name, name_words
15
+
16
+
17
+ class NameTokenPhonetic:
18
+ def __init__(self, token: str):
19
+ self.token = token
20
+ self.ascii = ascii_text(token) if can_latinize(token) else None
21
+
22
+ @cached_property
23
+ def metaphone(self) -> Optional[str]:
24
+ if self.ascii is not None:
25
+ phoneme = metaphone(self.ascii)
26
+ if len(phoneme) >= 3:
27
+ return phoneme
28
+ return None
29
+
30
+ # def __repr__(self) -> str:
31
+ # return f"<NameTokenPhonetic {self.token!r}, {self.ascii!r}, {self.metaphone!r}>"
32
+
33
+ @classmethod
34
+ def from_name(cls, name: str) -> List["NameTokenPhonetic"]:
35
+ tokens = tokenize_name(name.lower(), token_min_length=2)
36
+ return [cls(token) for token in tokens]
37
+
38
+
39
+ def metaphone_token(token: str) -> str:
40
+ if token.isalpha() and len(token) > 1:
41
+ out = metaphone(token)
42
+ # doesn't handle non-ascii characters
43
+ if len(out) >= 3:
44
+ return out
45
+ return token.upper()
46
+
47
+
48
+ def soundex_token(token: str) -> str:
49
+ if token.isalpha() and len(token) > 1:
50
+ out = soundex(token)
51
+ # doesn't handle non-ascii characters
52
+ if len(out):
53
+ return out
54
+ return token.upper()
55
+
56
+
57
+ def compare_parts_phonetic(left: NameTokenPhonetic, right: NameTokenPhonetic) -> bool:
58
+ if left.metaphone is None or right.metaphone is None:
59
+ return left.ascii == right.ascii
60
+ if (
61
+ left.metaphone == right.metaphone
62
+ and left.ascii is not None
63
+ and right.ascii is not None
64
+ ):
65
+ # Secondary check for Levenshtein distance:
66
+ if is_levenshtein_plausible(left.ascii, right.ascii):
67
+ return True
68
+ return False
69
+
70
+
71
+ def _clean_phonetic_entity(original: str) -> Optional[str]:
72
+ """Normalize a legal entity name without transliteration."""
73
+ if not can_latinize(original):
74
+ return None
75
+ return fingerprint_name(original)
76
+
77
+
78
+ def _token_names_compare(
79
+ query_names: List[List[str]], result_names: List[List[str]]
80
+ ) -> float:
81
+ score = 0.0
82
+ for q, r in product(query_names, result_names):
83
+ # length = max(2.0, (len(q) + len(r)) / 2.0)
84
+ length = max(2.0, len(q))
85
+ combo = len(list_intersection(q, r)) / float(length)
86
+ score = max(score, combo)
87
+ return score
88
+
89
+
90
+ def person_name_phonetic_match(query: E, result: E) -> float:
91
+ """Two persons have similar names, using a phonetic algorithm."""
92
+ if not has_schema(query, result, "Person"):
93
+ return 0.0
94
+ query_names_, result_names_ = type_pair(query, result, registry.name)
95
+ query_parts = [NameTokenPhonetic.from_name(n) for n in query_names_]
96
+ result_parts = [NameTokenPhonetic.from_name(n) for n in result_names_]
97
+ score = 0.0
98
+ for q, r in product(query_parts, result_parts):
99
+ if len(q) == 0:
100
+ continue
101
+ matches = list(r)
102
+ matched = 0
103
+ for part in q:
104
+ for other in matches:
105
+ if compare_parts_phonetic(part, other):
106
+ matches.remove(other)
107
+ matched += 1
108
+ break
109
+ score = max(score, matched / float(len(q)))
110
+ return score
111
+
112
+
113
+ def _metaphone_tokens(token: str) -> List[str]:
114
+ words: List[str] = []
115
+ for word in name_words(_clean_phonetic_entity(token), min_length=2):
116
+ words.append(metaphone_token(word))
117
+ return words
118
+
119
+
120
+ def name_metaphone_match(query: E, result: E) -> float:
121
+ """Two entities (person and non-person) have similar names, using the metaphone
122
+ algorithm."""
123
+ query_names_, result_names_ = type_pair(query, result, registry.name)
124
+ query_names = [_metaphone_tokens(n) for n in query_names_]
125
+ result_names = [_metaphone_tokens(n) for n in result_names_]
126
+ return _token_names_compare(query_names, result_names)
127
+
128
+
129
+ def _soundex_tokens(token: str) -> List[str]:
130
+ words: List[str] = []
131
+ for word in name_words(_clean_phonetic_entity(token), min_length=2):
132
+ words.append(soundex_token(word))
133
+ return words
134
+
135
+
136
+ def name_soundex_match(query: E, result: E) -> float:
137
+ """Two entities (person and non-person) have similar names, using the soundex
138
+ algorithm."""
139
+ query_names_, result_names_ = type_pair(query, result, registry.name)
140
+ query_names = [_soundex_tokens(n) for n in query_names_]
141
+ result_names = [_soundex_tokens(n) for n in result_names_]
142
+ return _token_names_compare(query_names, result_names)
@@ -0,0 +1,104 @@
1
+ from itertools import product
2
+ from rigour.ids import LEI, ISIN, INN, OGRN, IMO, BIC
3
+ from rigour.ids import StrictFormat
4
+ from rigour.text.distance import levenshtein
5
+ from followthemoney import E, registry
6
+
7
+ from nomenklatura.matching.types import FtResult, ScoringConfig
8
+ from nomenklatura.matching.util import has_schema, type_pair
9
+ from nomenklatura.matching.compare.util import clean_map, CleanFunc
10
+
11
+
12
+ def _id_prop_match(
13
+ query: E,
14
+ result: E,
15
+ prop_name: str,
16
+ clean: CleanFunc = None,
17
+ ) -> bool:
18
+ """Check if a specific property identifier is shared by two entities."""
19
+ prop = query.schema.get(prop_name)
20
+ if prop is None:
21
+ return False
22
+ lv = clean_map(query.get(prop), clean=clean)
23
+ if not len(lv):
24
+ return False
25
+ rv_ = result.get_type_values(prop.type, matchable=True)
26
+ rv = clean_map(rv_, clean=clean)
27
+ common = lv.intersection(rv)
28
+ return len(common) > 0
29
+
30
+
31
+ def _bidi_id_prop_match(
32
+ query: E,
33
+ result: E,
34
+ prop_name: str,
35
+ clean: CleanFunc = None,
36
+ ) -> FtResult:
37
+ """Check if a specific property identifier is shared by two entities."""
38
+ if _id_prop_match(query, result, prop_name, clean=clean):
39
+ return FtResult(score=1.0, detail="Property match: %r" % prop_name)
40
+ if _id_prop_match(result, query, prop_name, clean=clean):
41
+ return FtResult(score=1.0, detail="Property match: %r" % prop_name)
42
+ return FtResult(score=0.0, detail="No match: %r" % prop_name)
43
+
44
+
45
+ def lei_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
46
+ """Two entities have the same Legal Entity Identifier."""
47
+ return _bidi_id_prop_match(query, result, "leiCode", LEI.normalize)
48
+
49
+
50
+ def bic_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
51
+ """Two entities have the same SWIFT BIC."""
52
+ return _bidi_id_prop_match(query, result, "swiftBic", BIC.normalize)
53
+
54
+
55
+ def ogrn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
56
+ """Two entities have the same Russian company registration (OGRN) code."""
57
+ return _bidi_id_prop_match(query, result, "ogrnCode", OGRN.normalize)
58
+
59
+
60
+ def inn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
61
+ """Two entities have the same Russian tax identifier (INN)."""
62
+ return _bidi_id_prop_match(query, result, "innCode", INN.normalize)
63
+
64
+
65
+ def isin_security_match(query: E, result: E, config: ScoringConfig) -> FtResult:
66
+ """Two securities have the same ISIN."""
67
+ if not has_schema(query, result, "Security"):
68
+ return FtResult(score=0.0, detail="None of the entities is a security")
69
+ return _bidi_id_prop_match(query, result, "isin", ISIN.normalize)
70
+
71
+
72
+ def vessel_imo_mmsi_match(query: E, result: E, config: ScoringConfig) -> FtResult:
73
+ """Two vessels have the same IMO or MMSI identifier."""
74
+ imo_res = _bidi_id_prop_match(query, result, "imoNumber", IMO.normalize)
75
+ if imo_res.score > 0.0:
76
+ return imo_res
77
+ return _bidi_id_prop_match(query, result, "mmsi")
78
+
79
+
80
+ def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
81
+ """Two companies or organizations have different tax identifiers or registration
82
+ numbers."""
83
+ if not has_schema(query, result, "Organization"):
84
+ return FtResult(score=0.0, detail=None)
85
+ query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
86
+ query_ids = clean_map(query_ids_, StrictFormat.normalize)
87
+ result_ids = clean_map(result_ids_, StrictFormat.normalize)
88
+ if not len(query_ids) or not len(result_ids):
89
+ return FtResult(score=0.0, detail=None)
90
+ common = query_ids.intersection(result_ids)
91
+ if len(common) > 0:
92
+ return FtResult(score=0.0, detail=None)
93
+ max_ratio = 0.0
94
+ for query_id, result_id in product(query_ids, result_ids):
95
+ distance = levenshtein(query_id, result_id)
96
+ max_len = max(len(query_id), len(result_id))
97
+ ratio = 1.0 - (distance / float(max_len))
98
+ if ratio > 0.7:
99
+ max_ratio = max(max_ratio, ratio)
100
+ detail = "Mismatched identifiers: %s vs %s" % (
101
+ ", ".join(query_ids),
102
+ ", ".join(result_ids),
103
+ )
104
+ return FtResult(score=1 - max_ratio, detail=detail)
@@ -0,0 +1,105 @@
1
+ from typing import Dict, List
2
+
3
+ from nomenklatura.matching.logic_v1.phonetic import name_soundex_match, person_name_phonetic_match
4
+ from nomenklatura.matching.logic_v4.phonetic import name_metaphone_match
5
+ from nomenklatura.matching.types import Feature, FtResult, HeuristicAlgorithm
6
+ from nomenklatura.matching.types import ConfigVar, ConfigVarType
7
+ from nomenklatura.matching.compare.countries import country_mismatch
8
+ from nomenklatura.matching.compare.gender import gender_mismatch
9
+ from nomenklatura.matching.compare.identifiers import crypto_wallet_address
10
+ from nomenklatura.matching.compare.identifiers import identifier_match
11
+ from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
12
+ from nomenklatura.matching.compare.names import weak_alias_match
13
+ from nomenklatura.matching.compare.addresses import address_entity_match
14
+ from nomenklatura.matching.compare.addresses import address_prop_match
15
+ from nomenklatura.matching.logic_v2.names.match import name_match_levenshtein
16
+ from nomenklatura.matching.logic_v2.identifiers import bic_code_match
17
+ from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
18
+ from nomenklatura.matching.logic_v2.identifiers import isin_security_match
19
+ from nomenklatura.matching.logic_v2.identifiers import lei_code_match
20
+ from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
21
+ from nomenklatura.matching.logic_v2.identifiers import uei_code_match
22
+ from nomenklatura.matching.logic_v2.identifiers import npi_code_match
23
+ from nomenklatura.matching.util import FNUL
24
+
25
+
26
class LogicV4(HeuristicAlgorithm):
    """A rule-based matching system that derives a set of basic scores from
    name and identifier-based matching, and then qualifies the best of them
    using supporting or contradicting features of the two entities. Version 4
    uses the same set of features as version 3, but adds phonetic name matching
    as an additional name matching strategy. This new name matching function
    uses metaphone and soundex algorithms for phonetic name matching."""

    NAME = "logic-v4"

    # Features without ``qualifier=True`` compete for the base score; features
    # flagged as qualifiers are added on top of it (see ``compute_score``).
    features = [
        Feature(func=name_match_levenshtein, weight=1.0),
        Feature(func=FtResult.wrap(person_name_phonetic_match), weight=0.9),
        # Registered with a FNUL weight so that they can be switched on by
        # supplying custom weights:
        Feature(func=FtResult.wrap(name_metaphone_match), weight=FNUL),
        Feature(func=FtResult.wrap(name_soundex_match), weight=FNUL),
        Feature(func=address_entity_match, weight=0.98),
        Feature(func=crypto_wallet_address, weight=0.98),
        Feature(func=isin_security_match, weight=0.98),
        Feature(func=lei_code_match, weight=0.95),
        Feature(func=ogrn_code_match, weight=0.95),
        Feature(func=vessel_imo_mmsi_match, weight=0.95),
        Feature(func=inn_code_match, weight=0.95),
        Feature(func=bic_code_match, weight=0.95),
        Feature(func=uei_code_match, weight=0.95),
        Feature(func=npi_code_match, weight=0.95),
        Feature(func=identifier_match, weight=0.85),
        Feature(func=weak_alias_match, weight=0.8),
        # Qualifiers: adjust the base score up or down.
        Feature(func=address_prop_match, weight=0.2, qualifier=True),
        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.25, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
    ]

    # Configuration variables declared by this algorithm, with their defaults.
    CONFIG = {
        "nm_number_mismatch": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Penalty for mismatching numbers in object or company names.",
            default=0.3,
        ),
        "nm_extra_query_name": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Weight for name parts in the query not matched to the result.",
            default=0.8,
        ),
        "nm_extra_result_name": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Weight for name parts in the result not matched to the query.",
            default=0.2,
        ),
        "nm_family_name_weight": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Extra weight multiplier for family name in person matches (John Smith vs. John Gruber is clearly distinct).",
            default=1.3,
        ),
        "nm_fuzzy_cutoff_factor": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Extra factor for when a fuzzy match is triggered in name matching. "
            "Below a certain threshold, a fuzzy match is considered as a non-match (score = 0.0). "
            "Adjusting this multiplier will raise this threshold, making a fuzzy match trigger more leniently.",
            default=1.0,
        ),
    }

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Combine per-feature scores into a single match score.

        The base score is the largest ``score * weight`` product among the
        non-qualifier features. The ``score * weight`` product of every
        qualifier feature is then added to it, in feature order. A feature
        name missing from ``scores`` or ``weights`` contributes ``FNUL`` for
        that factor.
        """

        def weighted(name: str) -> float:
            return scores.get(name, FNUL) * weights.get(name, FNUL)

        base_scores: List[float] = [
            weighted(feature.name)
            for feature in cls.features
            if not feature.qualifier
        ]
        total = max(base_scores)
        # Qualifiers are added one at a time, in order, so that the
        # floating-point result matches a plain running sum.
        for feature in cls.features:
            if feature.qualifier:
                total += weighted(feature.name)
        return total
@@ -0,0 +1,21 @@
1
+ from followthemoney.proxy import E
2
+ from followthemoney.types import registry
3
+
4
+ from nomenklatura.matching.types import FtResult, ScoringConfig
5
+ from nomenklatura.matching.compare.util import extract_numbers
6
+ from nomenklatura.matching.util import type_pair, has_schema
7
+
8
+
9
def numbers_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Find numbers in names and addresses and penalise different numbers.

    Address values are compared when the ``has_schema`` check for ``"Address"``
    passes, name values otherwise. The score is the count of numbers extracted
    from the query that are absent from the result, divided by the smaller of
    the two sides' number counts (treated as at least 1). ``config`` is not
    used by this function.
    """
    value_type = (
        registry.address if has_schema(query, result, "Address") else registry.name
    )
    query_values, result_values = type_pair(query, result, value_type)
    query_numbers = extract_numbers(query_values)
    result_numbers = extract_numbers(result_values)
    # Clamp the denominator to 1 so that a side without numbers cannot cause
    # a division by zero.
    denominator = max(1, min(len(query_numbers), len(result_numbers)))
    unmatched = len(query_numbers.difference(result_numbers))
    return FtResult(
        score=float(unmatched) / float(denominator),
        detail="Mismatching numbers: %s" % unmatched,
    )