nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,268 @@
1
+ import logging
2
+ from typing import Generator, Optional, Set
3
+ from followthemoney.helpers import check_person_cutoff
4
+ from followthemoney import StatementEntity, registry, DS, SE
5
+ from requests import Session
6
+ from rigour.ids.wikidata import is_qid
7
+ from rigour.territories import get_territory_by_qid
8
+
9
+ from nomenklatura.cache import Cache
10
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
11
+ from nomenklatura.wikidata.client import WikidataClient
12
+ from nomenklatura.wikidata.lang import LangText
13
+ from nomenklatura.wikidata.model import Claim, Item
14
+ from nomenklatura.wikidata.props import (
15
+ PROPS_ASSOCIATION,
16
+ PROPS_DIRECT,
17
+ PROPS_FAMILY,
18
+ PROPS_QUALIFIED,
19
+ PROPS_TOPICS,
20
+ )
21
+ from nomenklatura.wikidata.qualified import qualify_value
22
+ from nomenklatura.wikidata.value import clean_name, is_alias_strong
23
+
24
+ log = logging.getLogger(__name__)
25
+
26
+
27
class WikidataEnricher(Enricher[DS]):
    """Enricher that looks up person entities on Wikidata and expands them
    into a small graph of family/associate relationships.

    Lookups go through a ``WikidataClient`` (which wraps the shared cache);
    traversal depth is controlled by the ``depth`` config option (default 1).
    """

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        # How many hops of family/associate links to follow in item_graph():
        self.depth = self.get_config_int("depth", 1)
        self.client = WikidataClient(cache, self.session, cache_days=self.cache_days)

    def keep_entity(self, entity: StatementEntity) -> bool:
        """Return False for entities filtered out by the person cutoff rule
        (see ``followthemoney.helpers.check_person_cutoff``)."""
        if check_person_cutoff(entity):
            return False
        return True

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield Wikidata-derived candidate proxies for a Person entity.

        If the entity already carries a QID, only that item is fetched;
        otherwise each of its names is run through the ``wbsearchentities``
        API and every search hit is converted to a proxy.
        """
        if not entity.schema.is_a("Person"):
            return

        wikidata_id = self.get_wikidata_id(entity)

        # Already has an ID associated with it:
        if wikidata_id is not None:
            item = self.client.fetch_item(wikidata_id)
            if item is not None:
                proxy = self.item_proxy(entity, item, schema=entity.schema.name)
                if proxy is not None and self.keep_entity(proxy):
                    yield proxy
            # Do not fall through to name search when a QID was present:
            return

        for name in entity.get("name", quiet=True):
            params = {
                "format": "json",
                "search": name,
                "action": "wbsearchentities",
                "language": "en",
                "strictlanguage": "false",
            }
            data = self.http_get_json_cached(WikidataClient.WD_API, params=params)
            if "search" not in data:
                # Evict the bad response so the next run retries the query:
                self.http_remove_cache(WikidataClient.WD_API, params=params)
                log.info("Search response [%s] does not include results" % name)
                continue
            for result in data["search"]:
                item = self.client.fetch_item(result["id"])
                if item is not None:
                    proxy = self.item_proxy(entity, item, schema=entity.schema.name)
                    if proxy is not None and self.keep_entity(proxy):
                        yield proxy

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield the full proxy for a matched item plus its relationship graph."""
        wikidata_id = self.get_wikidata_id(match)
        if wikidata_id is None:
            return
        item = self.client.fetch_item(wikidata_id)
        if item is None:
            return
        proxy = self.item_proxy(match, item, schema=match.schema.name)
        if proxy is None or not self.keep_entity(proxy):
            return
        # Propagate PEP status from the original entity onto the expansion:
        if "role.pep" in entity.get("topics", quiet=True):
            proxy.add("topics", "role.pep")
        yield proxy
        yield from self.item_graph(proxy, item)

    def get_wikidata_id(self, entity: SE) -> Optional[str]:
        """Return the entity's QID, either from its ID or a wikidataId prop."""
        if entity.id is not None and is_qid(entity.id):
            return str(entity.id)
        for value in entity.get("wikidataId", quiet=True):
            if is_qid(value):
                return value
        return None

    def make_link(
        self,
        proxy: SE,
        claim: Claim,
        depth: int,
        seen: Set[str],
        schema: str,
        other_schema: str,
        source_prop: str,
        target_prop: str,
    ) -> Generator[SE, None, None]:
        """Yield the entity at the other end of a relationship claim, its own
        sub-graph (depth - 1), and a link entity (e.g. Family/Associate)
        connecting the two.

        ``seen`` carries already-visited QIDs to break cycles.
        """
        if depth < 1 or claim.qid is None or claim.qid in seen:
            return
        item = self.client.fetch_item(claim.qid)
        if item is None:
            return

        other = self.item_proxy(proxy, item, schema=other_schema)
        if other is None or not self.keep_entity(other):
            return None
        if proxy.id is None or other.id is None:
            return None
        # Hacky: if an entity is a PEP, then by definition their relatives and
        # associates are RCA (relatives and close associates).
        if "role.pep" in proxy.get("topics", quiet=True):
            if "role.pep" not in other.get("topics"):
                other.add("topics", "role.rca")
        yield other
        yield from self.item_graph(other, item, depth=depth - 1, seen=seen)
        link = self.make_entity(proxy, schema)
        # Sort the two IDs so the link ID is stable regardless of direction:
        min_id, max_id = sorted((proxy.id, other.id))
        # FIXME: doesn't lead to collisions because claim.property has an inverse:
        link.id = f"wd-{claim.property}-{min_id}-{max_id}"
        link.id = link.id.lower()
        link.add(source_prop, proxy.id)
        link.add(target_prop, item.id)
        claim.property_label.apply(link, "relationship")

        # P580: start time
        for qual in claim.get_qualifier("P580"):
            qual.text.apply(link, "startDate")

        # P582: end time
        for qual in claim.get_qualifier("P582"):
            qual.text.apply(link, "endDate")

        # P585: point in time
        for qual in claim.get_qualifier("P585"):
            qual.text.apply(link, "date")

        # P1039: kinship to subject
        for qual in claim.get_qualifier("P1039"):
            qual.text.apply(link, "relationship")

        # P2868: subject has role
        for qual in claim.get_qualifier("P2868"):
            qual.text.apply(link, "relationship")

        # P854: reference URL — recorded as the link's source:
        for ref in claim.references:
            for snak in ref.get("P854"):
                snak.text.apply(link, "sourceUrl")
        yield link

    def item_graph(
        self,
        proxy: SE,
        item: Item,
        depth: Optional[int] = None,
        seen: Optional[Set[str]] = None,
    ) -> Generator[SE, None, None]:
        """Walk the item's family and association claims, yielding related
        entities and link entities up to ``depth`` hops away."""
        if seen is None:
            seen = set()
        # union() (not add) keeps caller-supplied sets unmutated:
        seen = seen.union([item.id])
        if depth is None:
            depth = self.depth
        for claim in item.claims:
            # TODO: memberships, employers?
            if claim.property in PROPS_FAMILY:
                yield from self.make_link(
                    proxy,
                    claim,
                    depth,
                    seen,
                    schema="Family",
                    other_schema="Person",
                    source_prop="person",
                    target_prop="relative",
                )
                continue
            if claim.property in PROPS_ASSOCIATION:
                yield from self.make_link(
                    proxy,
                    claim,
                    depth,
                    seen,
                    schema="Associate",
                    other_schema="Person",
                    source_prop="person",
                    target_prop="associate",
                )
                continue

    def item_proxy(self, ref: SE, item: Item, schema: str = "Person") -> Optional[SE]:
        """Convert a Wikidata item into an entity proxy of the given schema.

        Returns None when the item has no modification timestamp, or when a
        Person-schema proxy is not an instance of Q5 (human).
        """
        proxy = self.make_entity(ref, schema)
        proxy.id = item.id
        if item.modified is None:
            return None
        # proxy.add("modifiedAt", item.modified)
        proxy.add("wikidataId", item.id)
        # Lower-cased full names, used below to validate aliases/name parts:
        names: Set[str] = set()
        for label in item.labels:
            label.apply(proxy, "name", clean=clean_name)
            if label.text is not None:
                names.add(label.text.lower())
        if item.description is not None:
            item.description.apply(proxy, "notes")
        for alias in item.aliases:
            if alias.text is None or alias.text.lower() in names:
                continue
            _strong = is_alias_strong(alias.text, names)
            prop = "alias" if _strong else "weakAlias"
            alias.apply(proxy, prop, clean=clean_name)
            if _strong:
                names.add(alias.text.lower())

        if proxy.schema.is_a("Person") and not item.is_instance("Q5"):
            log.debug("Person is not a Q5 [%s]: %s", item.id, item.labels)
            return None

        names_concat = " ".join(names)
        for claim in item.claims:
            if claim.property is None:
                continue
            ftm_prop = PROPS_DIRECT.get(claim.property)
            if ftm_prop is None:
                continue
            if ftm_prop not in proxy.schema.properties:
                log.info("Entity %s does not have property: %s", proxy.id, ftm_prop)
                continue
            ftm_prop_ = proxy.schema.get(ftm_prop)
            if ftm_prop_ is None:
                log.info("Entity %s does not have property: %s", proxy.id, ftm_prop)
                continue
            # Country-typed props: resolve the claim QID to an FtM country code:
            if ftm_prop_.type == registry.country:
                territory = get_territory_by_qid(claim.qid)
                if territory is None or territory.ftm_country is None:
                    continue
                value = LangText(territory.ftm_country, original=claim.qid)
            else:
                value = claim.text

            # Sanity check that the name parts are in any of the full names:
            if ftm_prop in ("firstName", "lastName", "fatherName"):
                if value.text is None or value.text.lower() not in names_concat:
                    continue

            # Make sure the aliases look like the main name, otherwise mark them as weak:
            if ftm_prop == "alias":
                if value.text is None or value.text.lower() in names:
                    continue
                _strong = is_alias_strong(value.text, names)
                ftm_prop = "alias" if _strong else "weakAlias"

            if ftm_prop in PROPS_QUALIFIED:
                value = qualify_value(value, claim)
            if ftm_prop == "topics":
                topic = PROPS_TOPICS.get(claim.qid or "")
                if topic is None:
                    continue
                value = LangText(topic, original=claim.qid)
            value.apply(proxy, ftm_prop)
        return proxy
@@ -0,0 +1,116 @@
1
+ import os
2
+ import time
3
+ import logging
4
+ from banal import ensure_list
5
+ from typing import Any, Generator, Optional, Dict, List
6
+ from urllib.parse import urljoin
7
+ from followthemoney import registry, DS, SE
8
+ from followthemoney import StatementEntity
9
+ from followthemoney.namespace import Namespace
10
+ from requests import Session
11
+ from rigour.urls import build_url
12
+
13
+ from nomenklatura.cache import Cache
14
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
15
+ from nomenklatura.enrich.common import EnrichmentException
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+
20
class YenteEnricher(Enricher[DS]):
    """Uses the `yente` match API to look up entities in a specific dataset.

    Configuration keys: ``api`` (base URL, required), ``dataset``, ``cutoff``,
    ``algorithm``, ``expand_nested``, ``fuzzy``, ``strip_namespace`` and
    ``api_key`` (falls back to the ``YENTE_API_KEY`` environment variable).
    """

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        self._api: str = config.pop("api")
        self._yente_dataset: str = config.pop("dataset", "default")
        self._cutoff: Optional[float] = config.pop("cutoff", None)
        # Fixed: the scoring algorithm is a name like "best", not a float.
        self._algorithm: Optional[str] = config.pop("algorithm", "best")
        self._nested: bool = config.pop("expand_nested", True)
        self._fuzzy: bool = config.pop("fuzzy", False)
        self._ns: Optional[Namespace] = None
        if self.get_config_bool("strip_namespace"):
            self._ns = Namespace()

        # Environment variables inside the configured key are expanded; an
        # empty result falls back to the YENTE_API_KEY environment variable.
        # (expandvars(...).strip() always returns a str, never None, so a
        # simple truthiness check suffices here.)
        api_key: Optional[str] = os.path.expandvars(config.pop("api_key", "")).strip()
        if not api_key:
            api_key = os.environ.get("YENTE_API_KEY")
        self._api_key: Optional[str] = api_key
        if self._api_key is not None:
            self.session.headers["Authorization"] = f"ApiKey {self._api_key}"

    def make_url(self, entity: StatementEntity) -> str:
        """Build the yente detail URL for the given entity.

        NOTE(review): urljoin drops the last path segment of ``api`` when it
        has no trailing slash — confirm configured APIs end with "/".
        """
        return urljoin(self._api, f"entities/{entity.id}")

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Query the yente ``/match`` endpoint and yield candidate proxies.

        Retries up to four times with quadratic back-off on enrichment
        errors; the final failure is re-raised.
        """
        if not entity.schema.matchable:
            return
        url = urljoin(self._api, f"match/{self._yente_dataset}")
        params: Dict[str, Any] = {"fuzzy": self._fuzzy, "algorithm": self._algorithm}
        if self._cutoff is not None:
            params["cutoff"] = self._cutoff
        url = build_url(url, params)
        cache_key = f"{url}:{entity.id}"
        # Only send matchable, non-entity-typed properties in the query:
        props: Dict[str, List[str]] = {}
        for prop in entity.iterprops():
            if prop.type == registry.entity:
                continue
            if prop.matchable:
                props[prop.name] = entity.get(prop)
        query = {
            "queries": {
                "entity": {
                    "schema": entity.schema.name,
                    "properties": props,
                }
            }
        }
        for retry in range(4):
            try:
                response = self.http_post_json_cached(url, cache_key, query)
                inner_resp = response.get("responses", {}).get("entity", {})
                for result in inner_resp.get("results", []):
                    proxy = self.load_entity(entity, result)
                    proxy.add("sourceUrl", self.make_url(proxy))
                    if self._ns is not None:
                        proxy = self._ns.apply(proxy)
                    yield proxy
                return
            except EnrichmentException as exc:
                log.info("Error matching %r: %s", entity, exc)
                if retry == 3:
                    raise
                time.sleep((retry + 1) ** 2)

    def _traverse_nested(self, entity: SE, response: Any) -> Generator[SE, None, None]:
        """Recursively yield the entity and all nested entities embedded in
        the API response, back-filling reverse property references."""
        entity = self.load_entity(entity, response)
        if self._ns is not None:
            entity = self._ns.apply(entity)
        yield entity
        for prop_name, values in response.get("properties", {}).items():
            prop = entity.schema.properties.get(prop_name)
            if prop is None or prop.type != registry.entity:
                continue
            for value in ensure_list(values):
                if isinstance(value, dict):
                    # Link the nested entity back to its parent so the graph
                    # stays navigable in both directions:
                    if prop.reverse is not None and not prop.reverse.stub:
                        reverse = prop.reverse.name
                        if reverse not in value["properties"]:
                            value["properties"][reverse] = []
                        value["properties"][reverse].append(entity.id)
                    yield from self._traverse_nested(entity, value)

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Fetch the matched entity (preferring its recorded sourceUrl) and
        yield it together with its nested sub-entities."""
        url = self.make_url(match)
        for source_url in match.get("sourceUrl", quiet=True):
            if source_url.startswith(self._api):
                url = source_url
        url = build_url(url, {"nested": self._nested})
        response = self.http_get_json_cached(url)
        yield from self._traverse_nested(match, response)
@@ -0,0 +1,9 @@
1
+ from followthemoney.exc import FollowTheMoneyException
2
+
3
+
4
class NomenklaturaException(FollowTheMoneyException):
    """Root of the nomenklatura exception hierarchy, itself derived from the
    followthemoney base exception so callers can catch either."""

    pass
6
+
7
+
8
class MetadataException(NomenklaturaException):
    """Raised for invalid or inconsistent dataset metadata."""

    pass
@@ -0,0 +1,5 @@
1
+ from nomenklatura.index.index import Index
2
+ from nomenklatura.index.common import BaseIndex
3
+
4
+
5
+ __all__ = ["BaseIndex", "Index"]
@@ -0,0 +1,24 @@
1
+ from pathlib import Path
2
+ from typing import Generic, Iterable, List, Tuple
3
+ from followthemoney import DS, SE
4
+ from nomenklatura.resolver import Identifier
5
+ from nomenklatura.store import View
6
+
7
+
8
class BaseIndex(Generic[DS, SE]):
    """Abstract interface for blocking indexes used in cross-referencing.

    Concrete implementations index the entities of a store view and produce
    scored candidate pairs or per-entity match lists.
    """

    # Default ceiling on the number of candidate pairs returned by pairs():
    MAX_PAIRS = 10_000
    # Short identifier for the implementation (e.g. "memory"):
    name: str

    def __init__(self, view: View[DS, SE], data_dir: Path) -> None:
        raise NotImplementedError

    def build(self) -> None:
        """Index all entities from the view."""
        raise NotImplementedError

    def pairs(
        self, max_pairs: int = MAX_PAIRS
    ) -> Iterable[Tuple[Tuple[Identifier, Identifier], float]]:
        """Return up to ``max_pairs`` scored candidate entity-ID pairs."""
        raise NotImplementedError

    def match(self, entity: SE) -> List[Tuple[Identifier, float]]:
        """Return scored candidate entity IDs for a single entity."""
        raise NotImplementedError
@@ -0,0 +1,89 @@
1
+ from typing import Any, Dict, Generator, Tuple
2
+
3
+ from nomenklatura.resolver import Identifier
4
+
5
+
6
class Entry(object):
    """A set of entities and a weight associated with a given term in the index.

    Tracks, per entity, how many times the entry's token occurred in that
    entity's values.
    """

    # NOTE: "idf" is declared as a slot but never assigned by this class.
    __slots__ = "idf", "entities"

    def __init__(self) -> None:
        # Per-entity occurrence count of this token:
        self.entities: Dict[Identifier, int] = dict()

    def add(self, entity_id: Identifier) -> None:
        """Mark the given entity as relevant to the entry's token."""
        self.entities[entity_id] = self.entities.get(entity_id, 0) + 1

    def frequencies(
        self, field: "Field"
    ) -> Generator[Tuple[Identifier, float], None, None]:
        """
        Term Frequency (TF) for each entity in this entry.

        TF being the number of occurrences of this token in the entity divided
        by the total number of tokens in the entity (scoped to this field).
        """
        for entity_id, mentions in self.entities.items():
            tokens_in_entity = field.entities[entity_id]
            # Guard against division by zero for degenerate entries:
            divisor = max(1, tokens_in_entity)
            yield entity_id, mentions / divisor

    def __repr__(self) -> str:
        return f"<Entry({len(self.entities)!r})>"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for pickling; the inverse of from_dict()."""
        return {"entities": self.entities}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Entry":
        """Restore an entry from its to_dict() representation."""
        entry = cls()
        entry.entities = data["entities"]
        return entry
46
+
47
+
48
class Field(object):
    """Index of all tokens of the same type.

    Maps each token to an Entry and keeps a per-entity count of how many
    tokens that entity contributed to this field.
    """

    __slots__ = "len", "avg_len", "tokens", "entities"

    def __init__(self) -> None:
        # Number of entities in this field (computed by compute(), min 1):
        self.len = 0
        # Mean token count per entity (computed by compute()):
        self.avg_len = 0.0
        self.tokens: Dict[str, Entry] = {}
        self.entities: Dict[Identifier, int] = {}

    def add(self, entity_id: Identifier, token: str) -> None:
        """Record one occurrence of ``token`` for ``entity_id``."""
        entry = self.tokens.get(token)
        if entry is None:
            entry = Entry()
            self.tokens[token] = entry
        entry.add(entity_id)
        self.entities[entity_id] = self.entities.get(entity_id, 0) + 1

    def compute(self) -> None:
        """Recompute the cached length statistics after indexing."""
        total_tokens = sum(self.entities.values())
        self.len = max(1, len(self.entities))
        self.avg_len = total_tokens / self.len

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for pickling; the inverse of from_dict()."""
        token_state = {token: entry.to_dict() for token, entry in self.tokens.items()}
        entity_state = {ident.id: count for ident, count in self.entities.items()}
        return {"tokens": token_state, "entities": entity_state}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Field":
        """Restore a field from its to_dict() representation."""
        field = cls()
        field.tokens = {
            token: Entry.from_dict(state) for token, state in data["tokens"].items()
        }
        # Entity keys were flattened to plain ID strings for serialization;
        # rehydrate them into Identifier instances:
        entity_state: Dict[str, int] = data.get("entities", {})
        field.entities = {
            Identifier.get(raw): count for raw, count in entity_state.items()
        }
        return field

    def __repr__(self) -> str:
        return "<Field(%d, %.3f)>" % (self.len, self.avg_len)
@@ -0,0 +1,170 @@
1
+ from pathlib import Path
2
+ import pickle
3
+ import logging
4
+ from itertools import combinations
5
+ from typing import Any, Dict, List, Set, Tuple
6
+ from followthemoney import registry, DS, SE
7
+ from followthemoney.util import PathLike
8
+
9
+ from nomenklatura.resolver import Pair, Identifier
10
+ from nomenklatura.store import View
11
+ from nomenklatura.index.entry import Field
12
+ from nomenklatura.index.tokenizer import NAME_PART_FIELD, WORD_FIELD, Tokenizer
13
+ from nomenklatura.index.common import BaseIndex
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+
18
class Index(BaseIndex[DS, SE]):
    """
    An in-memory search index to match entities against a given dataset.

    For each field in the dataset, the index stores the IDs which contains each
    token, along with the absolute frequency of each token in the document.
    """

    name = "memory"

    # Per-field score multipliers applied in pairs() and match(); fields not
    # listed here default to 1.0.
    BOOSTS = {
        NAME_PART_FIELD: 2.0,
        WORD_FIELD: 0.5,
        registry.name.name: 10.0,
        # registry.country.name: 1.5,
        # registry.date.name: 1.5,
        # registry.language: 0.7,
        # registry.iban.name: 3.0,
        registry.phone.name: 3.0,
        registry.email.name: 3.0,
        # registry.entity: 0.0,
        # registry.topic: 2.1,
        registry.address.name: 2.5,
        registry.identifier.name: 3.0,
    }

    __slots__ = "view", "fields", "tokenizer", "entities"

    def __init__(self, view: View[DS, SE], data_dir: Path):
        # NOTE(review): data_dir is accepted for interface compatibility with
        # BaseIndex but is not used by this in-memory implementation.
        self.view = view
        self.tokenizer = Tokenizer[DS, SE]()
        self.fields: Dict[str, Field] = {}
        self.entities: Set[Identifier] = set()

    def index(self, entity: SE) -> None:
        """Index one entity. This is not idempotent, you need to remove the
        entity before re-indexing it."""
        if not entity.schema.matchable or entity.id is None:
            return
        ident = Identifier.get(entity.id)
        for field, token in self.tokenizer.entity(entity):
            if field not in self.fields:
                self.fields[field] = Field()
            self.fields[field].add(ident, token)
        self.entities.add(ident)

    def build(self) -> None:
        """Index all entities in the dataset."""
        log.info("Building index from: %r...", self.view)
        self.fields = {}
        self.entities = set()
        for entity in self.view.entities():
            self.index(entity)
        self.commit()
        log.info("Built index: %r", self)

    def commit(self) -> None:
        """Recompute each field's cached length statistics after indexing."""
        for field in self.fields.values():
            field.compute()

    def pairs(self, max_pairs: int = BaseIndex.MAX_PAIRS) -> List[Tuple[Pair, float]]:
        """
        A second method of doing xref: summing up the pairwise match value
        for all entities linearly. This uses a lot of memory but is really
        fast.

        The score of each pair is the the sum of the product of term frequencies for
        each co-occurring token in each field of the pair.

        We skip any tokens with more than 100 entities.
        """
        pairs: Dict[Pair, float] = {}
        log.info("Building index blocking pairs...")
        for field_name, field in self.fields.items():
            boost = self.BOOSTS.get(field_name, 1.0)
            for idx, entry in enumerate(field.tokens.values()):
                if idx % 10000 == 0:
                    log.info("Pairwise xref [%s]: %d" % (field_name, idx))

                # Single-entity tokens produce no pairs; very common tokens
                # (>100 entities) would explode combinatorially — skip both:
                if len(entry.entities) == 1 or len(entry.entities) > 100:
                    continue
                entities = entry.frequencies(field)
                for (left, lw), (right, rw) in combinations(entities, 2):
                    if lw == 0.0 or rw == 0.0:
                        continue
                    # Canonical ordering so (a, b) and (b, a) share one slot:
                    pair = (max(left, right), min(left, right))
                    if pair not in pairs:
                        pairs[pair] = 0
                    score = (lw + rw) * boost
                    pairs[pair] += score

        return sorted(pairs.items(), key=lambda p: p[1], reverse=True)[:max_pairs]

    def match(self, entity: SE) -> List[Tuple[Identifier, float]]:
        """Match an entity against the index, returning a list of
        (entity_id, score) pairs."""
        scores: Dict[Identifier, float] = {}
        for field_name, token in self.tokenizer.entity(entity):
            field = self.fields.get(field_name)
            if field is None:
                continue
            entry = field.tokens.get(token)
            if entry is None:
                continue
            for ident, weight in entry.frequencies(field):
                if ident not in scores:
                    scores[ident] = 0.0
                scores[ident] += weight * self.BOOSTS.get(field_name, 1.0)
        return sorted(scores.items(), key=lambda s: s[1], reverse=True)

    def save(self, path: PathLike) -> None:
        """Pickle the index state to ``path``."""
        with open(path, "wb") as fh:
            pickle.dump(self.to_dict(), fh)

    @classmethod
    def load(cls, view: View[DS, SE], path: Path, data_dir: Path) -> "Index[DS, SE]":
        """Load a pickled index from ``path``; if the file does not exist,
        build a fresh index from the view and save it there instead."""
        index = Index(view, data_dir)
        if not path.exists():
            log.debug("Cannot load: %r", index)
            index.build()
            index.save(path)
            return index

        with open(path, "rb") as fh:
            # NOTE(review): pickle.load on a file from disk — only load
            # index files this process (or a trusted one) has written.
            state = pickle.load(fh)
        index.from_dict(state)
        index.commit()
        log.debug("Loaded: %r", index)
        return index

    def to_dict(self) -> Dict[str, Any]:
        """Prepare an index for pickling."""
        return {
            "fields": {n: f.to_dict() for n, f in self.fields.items()},
            "entities": [e.id for e in self.entities],
        }

    def from_dict(self, state: Dict[str, Any]) -> None:
        """Restore a pickled index."""
        fields = state["fields"].items()
        self.fields = {t: Field.from_dict(i) for t, i in fields}
        entities: List[str] = state.get("entities", [])
        self.entities = set((Identifier.get(e) for e in entities))

    def __len__(self) -> int:
        return len(self.entities)

    def __repr__(self) -> str:
        return "<Index(%r, %d, %d)>" % (
            self.view.scope.name,
            len(self.fields),
            len(self.entities),
        )