nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,94 @@
1
+ import logging
2
+ from requests import Session
3
+ from importlib import import_module
4
+ from typing import Iterable, Generator, Optional, Type, cast
5
+ from followthemoney import DS, SE
6
+
7
+ from nomenklatura.cache import Cache
8
+ from nomenklatura.matching import DefaultAlgorithm
9
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
10
+ from nomenklatura.enrich.common import EnrichmentAbort, EnrichmentException
11
+ from nomenklatura.judgement import Judgement
12
+ from nomenklatura.matching.types import ScoringConfig
13
+ from nomenklatura.resolver import Resolver
14
+
15
+ log = logging.getLogger(__name__)
16
+ __all__ = [
17
+ "Enricher",
18
+ "EnrichmentAbort",
19
+ "EnrichmentException",
20
+ "make_enricher",
21
+ "enrich",
22
+ "match",
23
+ ]
24
+
25
+
26
def make_enricher(
    dataset: DS,
    cache: Cache,
    config: EnricherConfig,
    http_session: Optional[Session] = None,
) -> Enricher[DS]:
    """Instantiate an enricher from its configuration.

    The config must contain a "type" key of the form
    ``module.path:ClassName``; it is popped off the dict, so the
    remaining keys become the enricher's own settings.

    Args:
        dataset: The dataset the enricher will emit entities for.
        cache: HTTP/response cache shared with the enricher.
        config: Enricher settings, including the "type" import path.
        http_session: Optional pre-configured requests session.

    Raises:
        RuntimeError: If the import path is malformed or does not
            resolve to an ``Enricher`` subclass.
    """
    enricher_type = config.pop("type")
    if ":" not in enricher_type:
        raise RuntimeError("Invalid import path: %r" % enricher_type)
    module_name, clazz_name = enricher_type.split(":", 1)
    module = import_module(module_name)
    # Use a default of None so a missing attribute produces the
    # RuntimeError below instead of an unrelated AttributeError.
    # (The original `getattr(module, clazz_name)` raised before the
    # None check could ever run.)
    clazz = getattr(module, clazz_name, None)
    # Guard with isinstance(clazz, type) so issubclass cannot raise a
    # TypeError when the attribute is not a class at all.
    if clazz is None or not isinstance(clazz, type) or not issubclass(clazz, Enricher):
        raise RuntimeError("Invalid enricher: %r" % enricher_type)
    enr_clazz = cast(Type[Enricher[DS]], clazz)
    return enr_clazz(dataset, cache, config, session=http_session)
42
+
43
+
44
# nk match -i entities.json -o entities-with-matches.json -r resolver.json
# then:
# nk dedupe -i entities-with-matches.json -r resolver.json
def match(
    enricher: Enricher[DS],
    resolver: Resolver[SE],
    entities: Iterable[SE],
    config: Optional[ScoringConfig] = None,
) -> Generator[SE, None, None]:
    """Yield every input entity, followed by scored candidates from the enricher.

    For each usable candidate pair the default algorithm computes a
    score, which is recorded on the resolver as a suggestion; the
    candidate is then tagged with the enricher's dataset, canonicalised
    via the resolver and re-emitted. Enrichment failures for a single
    entity are logged and skipped.
    """
    scoring = ScoringConfig.defaults() if config is None else config
    for entity in entities:
        yield entity
        try:
            for candidate in enricher.match_wrapped(entity):
                # Skip pairs that cannot be judged: missing IDs, already
                # decided in the resolver, or schema-incompatible.
                unusable = (
                    entity.id is None
                    or candidate.id is None
                    or not resolver.check_candidate(entity.id, candidate.id)
                    or not entity.schema.can_match(candidate.schema)
                )
                if unusable:
                    continue
                result = DefaultAlgorithm.compare(entity, candidate, scoring)
                log.info("Match [%s]: %.2f -> %s", entity, result.score, candidate)
                resolver.suggest(entity.id, candidate.id, result.score)
                candidate.datasets.add(enricher.dataset.name)
                yield resolver.apply(candidate)
        except EnrichmentException:
            log.exception("Failed to match: %r" % entity)
73
+
74
+
75
# nk enrich -i entities.json -r resolver.json -o combined.json
def enrich(
    enricher: Enricher[DS], resolver: Resolver[SE], entities: Iterable[SE]
) -> Generator[SE, None, None]:
    """Yield adjacent entities for candidates already judged POSITIVE.

    Input entities themselves are not re-emitted; only the expansion of
    positively-resolved matches is, tagged with the enricher's dataset
    and canonicalised via the resolver. Enrichment failures for a
    single entity are logged and skipped.
    """
    for entity in entities:
        try:
            for candidate in enricher.match_wrapped(entity):
                if entity.id is None or candidate.id is None:
                    continue
                # Only expand pairs a human / prior run has confirmed.
                judgement = resolver.get_judgement(candidate.id, entity.id)
                if judgement != Judgement.POSITIVE:
                    continue

                log.info("Enrich [%s]: %r", entity, candidate)
                for adjacent in enricher.expand_wrapped(entity, candidate):
                    adjacent.datasets.add(enricher.dataset.name)
                    yield resolver.apply(adjacent)
        except EnrichmentException:
            log.exception("Failed to enrich: %r" % entity)
@@ -0,0 +1,141 @@
1
+ import os
2
+ import uuid
3
+ import logging
4
+ from banal import is_mapping, ensure_list, hash_data
5
+ from typing import Any, Dict, cast, Generator, Optional
6
+ from urllib.parse import urljoin
7
+ from functools import cached_property
8
+ from followthemoney.exc import InvalidData
9
+ from followthemoney.namespace import Namespace
10
+ from followthemoney import DS, SE
11
+ from requests import Session
12
+ from rigour.urls import build_url
13
+
14
+ from nomenklatura.cache import Cache
15
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+
20
class AlephEnricher(Enricher[DS]):
    """Match and expand entities against an Aleph instance's HTTP API.

    Config keys: "host" (overrides $ALEPH_HOST, defaults to the OCCRP
    instance), "api_key" (overrides $ALEPH_API_KEY), "collection" (a
    foreign_id used to scope match queries), and "strip_namespace"
    (apply a plain Namespace to query entities).
    """

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config)
        self._host: str = os.environ.get("ALEPH_HOST", "https://aleph.occrp.org/")
        self._host = self.get_config_expand("host") or self._host
        self._base_url: str = urljoin(self._host, "/api/2/")
        self._collection: Optional[str] = self.get_config_expand("collection")
        self._ns: Optional[Namespace] = None
        if self.get_config_bool("strip_namespace"):
            self._ns = Namespace()
        self._api_key: Optional[str] = os.environ.get("ALEPH_API_KEY")
        self._api_key = self.get_config_expand("api_key") or self._api_key
        if self._api_key is not None:
            self.session.headers["Authorization"] = f"ApiKey {self._api_key}"
        # One random session ID per enricher instance.
        # NOTE(review): presumably used by Aleph to correlate requests
        # from a single run — confirm against the Aleph API.
        self.session.headers["X-Aleph-Session"] = str(uuid.uuid4())

    @cached_property
    def collection_id(self) -> Optional[str]:
        """Resolve the configured collection foreign_id to its Aleph id.

        Returns None when no collection is configured; otherwise the id
        of the first search result (raises for HTTP errors). Cached for
        the lifetime of the enricher.
        """
        if self._collection is None:
            return None
        url = urljoin(self._base_url, "collections")
        url = build_url(url, {"filter:foreign_id": self._collection})
        res = self.session.get(url)
        res.raise_for_status()
        response = res.json()
        for result in response.get("results", []):
            return cast(str, result["id"])
        return None

    def load_aleph_entity(self, entity: SE, data: Dict[str, Any]) -> Optional[SE]:
        """Convert an Aleph API entity payload into a proxy of the same
        type as `entity`, or None when the schema is unknown locally."""
        # Record the Aleph entity id as a referent of the new proxy.
        data["referents"] = [data["id"]]
        try:
            proxy = super().load_entity(entity, data)
        except InvalidData:
            log.warning("Server model mismatch: %s" % data.get("schema"))
            return None
        links = data.get("links", {})
        proxy.add("alephUrl", links.get("self"), quiet=True, cleaned=True)
        collection = data.get("collection", {})
        proxy.add("publisher", collection.get("label"), quiet=True, cleaned=True)
        # clinks = collection.get("links", {})
        # entity.add("publisherUrl", clinks.get("ui"), quiet=True, cleaned=True)
        return proxy

    def convert_nested(
        self, entity: SE, data: Dict[str, Any]
    ) -> Generator[SE, None, None]:
        """Yield the entity built from `data`, then any entities embedded
        as mappings inside its property values."""
        proxy = self.load_aleph_entity(entity, data)
        if proxy is not None:
            if self._ns is not None:
                # NOTE(review): the namespace is applied to `entity`, yet
                # the un-rewritten `proxy` is what gets yielded — confirm
                # this isn't meant to be proxy = self._ns.apply(proxy).
                entity = self._ns.apply(entity)
            yield proxy
        properties = data.get("properties", {})
        for _, values in properties.items():
            for value in ensure_list(values):
                if is_mapping(value):
                    proxy = self.load_aleph_entity(entity, value)
                    if proxy is not None:
                        yield proxy

    # def enrich_entity(self, entity):
    #     url = self.api._make_url("match")
    #     for page in range(10):
    #         data = self.post_match(url, entity)
    #         for res in data.get("results", []):
    #             proxy = self.convert_entity(res)
    #             yield self.make_match(entity, proxy)

    #         url = data.get("next")
    #         if url is None:
    #             break

    # def expand_entity(self, entity):
    #     for url in entity.get("alephUrl", quiet=True):
    #         data = self.get_api(url)
    #         yield from self.convert_nested(data)

    #         _, entity_id = url.rsplit("/", 1)
    #         filters = (("entities", entity_id),)
    #         search_api = self.api._make_url("entities", filters=filters)
    #         while True:
    #             res = self.get_api(search_api)
    #             for data in ensure_list(res.get("results")):
    #                 yield from self.convert_nested(data)

    #             search_api = res.get("next")
    #             if search_api is None:
    #                 break

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """POST the entity to Aleph's /match endpoint and yield candidates."""
        if not entity.schema.matchable:
            return
        url = urljoin(self._base_url, "match")
        if self.collection_id is not None:
            url = build_url(url, {"collection_ids": self.collection_id})
        query = {
            "schema": entity.schema.name,
            "properties": entity.properties,
        }
        # Cache key combines the URL (including any collection scope)
        # with the entity id, falling back to a hash of the query body.
        cache_id = entity.id or hash_data(query)
        cache_key = f"{url}:{cache_id}"
        response = self.http_post_json_cached(url, cache_key, query)
        for result in response.get("results", []):
            proxy = self.load_aleph_entity(entity, result)
            if proxy is not None:
                if self._ns is not None:
                    # NOTE(review): same pattern as convert_nested — the
                    # namespace rewrites `entity`, not the yielded proxy.
                    entity = self._ns.apply(entity)
                yield proxy

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Fetch the matched entity's full record and yield it together
        with its nested property entities."""
        url = urljoin(self._base_url, f"entities/{match.id}")
        # Prefer a stored alephUrl for this host over the constructed URL.
        # NOTE(review): URLs that already start with the API base get
        # "/entities/" rewritten to "/api/2/entities/", which would
        # duplicate the API prefix — confirm the stored alephUrl format.
        for aleph_url in match.get("alephUrl", quiet=True):
            if aleph_url.startswith(self._base_url):
                url = aleph_url.replace("/entities/", "/api/2/entities/")
        response = self.http_get_json_cached(url)
        yield from self.convert_nested(match, response)
@@ -0,0 +1,219 @@
1
+ import os
2
+ import json
3
+ import logging
4
+ import traceback
5
+ from banal import as_bool
6
+ from typing import Union, Any, Dict, Optional, Generator, Generic
7
+ from abc import ABC, abstractmethod
8
+ from requests import Session
9
+ from requests.exceptions import RequestException, ChunkedEncodingError
10
+ from followthemoney import DS, registry
11
+ from followthemoney import StatementEntity, SE
12
+ from followthemoney.types.topic import TopicType
13
+ from rigour.urls import build_url, ParamsType
14
+
15
+ from nomenklatura import __version__
16
+ from nomenklatura.cache import Cache
17
+ from nomenklatura.util import HeadersType
18
+
19
+ EnricherConfig = Dict[str, Any]
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
class EnrichmentException(Exception):
    """Recoverable failure while enriching a single entity; callers can
    log it and continue with the next entity."""

    pass
25
+
26
+
27
class EnrichmentAbort(Exception):
    """Fatal enrichment failure, e.g. an authorization error (HTTP
    401/403), where continuing with further requests is pointless."""

    pass
29
+
30
+
31
class BaseEnricher(Generic[DS]):
    """Shared configuration handling for enrichers.

    Pops the common settings ("cache_days", "schemata", "topics") off
    the config dict and offers typed accessors for the rest.
    """

    def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig):
        self.dataset = dataset
        self.cache = cache
        self.config = config
        # Default HTTP cache lifetime in days; the HTTP helpers skip
        # storing responses when this is 0.
        self.cache_days = int(config.pop("cache_days", 90))
        self._filter_schemata = config.pop("schemata", [])
        self._filter_topics = config.pop("topics", [])

    def get_config_expand(
        self, name: str, default: Optional[str] = None
    ) -> Optional[str]:
        """Get a config value as a string with $VAR references expanded
        from the environment; None stays None."""
        value = self.config.get(name, default)
        if value is None:
            return None
        return str(os.path.expandvars(value))

    def get_config_int(self, name: str, default: Union[int, str]) -> int:
        """Get a config value coerced to int."""
        return int(self.config.get(name, default))

    def get_config_bool(self, name: str, default: Union[bool, str] = False) -> bool:
        """Get a config value coerced to bool via banal.as_bool."""
        return as_bool(self.config.get(name, default))

    def _filter_entity(self, entity: StatementEntity) -> bool:
        """Check if the given entity should be filtered out. Filters
        can be applied by schema or by topic. Returns True when the
        entity passes all configured filters; the special topic "all"
        expands to every known topic name."""
        if len(self._filter_schemata):
            if entity.schema.name not in self._filter_schemata:
                return False
        _filter_topics = set(self._filter_topics)
        if "all" in _filter_topics:
            assert isinstance(registry.topic, TopicType)
            _filter_topics.update(registry.topic.names.keys())
        if len(_filter_topics):
            topics = set(entity.get_type_values(registry.topic))
            if not len(topics.intersection(_filter_topics)):
                return False
        return True
69
+
70
+
71
class Enricher(BaseEnricher[DS], ABC):
    """Abstract enricher base: cached HTTP helpers plus the
    match/expand contract implemented by concrete backends."""

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config)
        # Optional externally-managed session; created lazily otherwise.
        self._session: Optional[Session] = session

    @property
    def session(self) -> Session:
        """The HTTP session, created on first access with a versioned
        User-Agent header."""
        if self._session is None:
            self._session = Session()
            self._session.headers["User-Agent"] = f"nomenklatura/{__version__}"
        return self._session

    def http_get_cached(
        self,
        url: str,
        params: ParamsType = None,
        hidden: ParamsType = None,
        cache_days: Optional[int] = None,
    ) -> str:
        """GET a URL and return the body, serving from cache when fresh.

        `hidden` params are appended to the live request URL only and
        never become part of the cache key (e.g. to keep credentials
        out of the cache). `cache_days` overrides the configured
        default; 0 disables storing the response.

        Raises:
            EnrichmentAbort: on HTTP 401/403 responses.
            EnrichmentException: on any other request failure.
        """
        url = build_url(url, params=params)
        cache_days_ = self.cache_days if cache_days is None else cache_days
        response = self.cache.get(url, max_age=cache_days_)
        if response is None:
            log.debug("HTTP GET: %s", url)
            hidden_url = build_url(url, params=hidden)
            try:
                resp = self.session.get(hidden_url)
                resp.raise_for_status()
            except RequestException as rex:
                if rex.response is not None and rex.response.status_code in (401, 403):
                    raise EnrichmentAbort("Authorization failure: %s" % url) from rex
                msg = "HTTP fetch failed [%s]: %s" % (url, rex)
                log.info(f"{msg}\n{traceback.format_exc()}")
                raise EnrichmentException(msg) from rex
            response = resp.text
            if cache_days_ > 0:
                self.cache.set(url, response)
        return response

    def http_remove_cache(self, url: str, params: ParamsType = None) -> None:
        """Evict the cached response for a GET URL (params included in key)."""
        url = build_url(url, params=params)
        self.cache.delete(url)

    def http_get_json_cached(
        self,
        url: str,
        params: ParamsType = None,
        hidden: ParamsType = None,
        cache_days: Optional[int] = None,
    ) -> Any:
        """GET via http_get_cached and parse the body as JSON."""
        res = self.http_get_cached(url, params, hidden=hidden, cache_days=cache_days)
        return json.loads(res)

    def http_post_json_cached(
        self,
        url: str,
        cache_key: str,
        json: Any = None,
        data: Any = None,
        headers: HeadersType = None,
        cache_days: Optional[int] = None,
        retry_chunked_encoding_error: int = 1,
    ) -> Any:
        """POST to a URL and return the JSON response, cached under an
        explicit `cache_key` (POST bodies aren't part of the URL, so the
        caller must choose the key).

        Retries once (by default) on chunked-encoding failures; raises
        EnrichmentAbort on 401/403 and EnrichmentException otherwise.
        """
        cache_days_ = self.cache_days if cache_days is None else cache_days
        resp_data = self.cache.get_json(cache_key, max_age=cache_days_)
        if resp_data is None:
            try:
                resp = self.session.post(url, json=json, data=data, headers=headers)
                resp.raise_for_status()
            # Must precede RequestException: ChunkedEncodingError is a
            # subclass and would otherwise be swallowed by that handler.
            except ChunkedEncodingError as rex:
                # Due to https://github.com/urllib3/urllib3/issues/2751#issuecomment-2567630065,
                # urllib3's Retry strategy will not retry on chunked encoding errors.
                # Since urllib won't retry it, retry it here.
                # urllib does close the connection.
                if (
                    "Response ended prematurely" in str(rex)
                    and retry_chunked_encoding_error > 0
                ):
                    log.info("Retrying due to chunked encoding error: %s", rex)
                    return self.http_post_json_cached(
                        url,
                        cache_key,
                        json=json,
                        data=data,
                        headers=headers,
                        cache_days=cache_days,
                        retry_chunked_encoding_error=retry_chunked_encoding_error - 1,
                    )

                msg = "HTTP POST failed [%s]: %s" % (url, rex)
                raise EnrichmentException(msg) from rex
            except RequestException as rex:
                if rex.response is not None and rex.response.status_code in (401, 403):
                    raise EnrichmentAbort("Authorization failure: %s" % url) from rex

                msg = "HTTP POST failed [%s]: %s" % (url, rex)
                log.info(f"{msg}\n{traceback.format_exc()}")
                raise EnrichmentException(msg) from rex
            resp_data = resp.json()
            if cache_days_ > 0:
                self.cache.set_json(cache_key, resp_data)
        return resp_data

    def _make_data_entity(
        self, entity: SE, data: Dict[str, Any], cleaned: bool = True
    ) -> SE:
        """Create an entity which is of the same sub-type of SE as the given
        query entity."""
        return type(entity).from_data(self.dataset, data, cleaned=cleaned)

    def load_entity(self, entity: SE, data: Dict[str, Any]) -> SE:
        """Build an entity from raw API data (uncleaned), dropping all
        stub properties from the result."""
        proxy = self._make_data_entity(entity, data, cleaned=False)
        for prop in proxy.iterprops():
            if prop.stub:
                proxy.pop(prop)
        return proxy

    def make_entity(self, entity: SE, schema: str) -> SE:
        """Create a new entity of the given schema."""
        return self._make_data_entity(entity, {"schema": schema})

    def match_wrapped(self, entity: SE) -> Generator[SE, None, None]:
        """Run match() only when the entity passes the configured
        schema/topic filters."""
        if not self._filter_entity(entity):
            return
        yield from self.match(entity)

    def expand_wrapped(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Run expand() only when the query entity passes the configured
        schema/topic filters."""
        if not self._filter_entity(entity):
            return
        yield from self.expand(entity, match)

    @abstractmethod
    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield candidate entities matching the query entity."""
        raise NotImplementedError()

    @abstractmethod
    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield the match and/or entities adjacent to it."""
        raise NotImplementedError()

    def close(self) -> None:
        """Release the cache and any session this enricher created."""
        self.cache.close()
        if self._session is not None:
            self._session.close()
@@ -0,0 +1,72 @@
1
+ import logging
2
+ from normality import squash_spaces
3
+ from typing import Any, Dict, Iterable, Generator, Optional
4
+
5
+ from requests import Session
6
+ from followthemoney import DS, SE
7
+ from followthemoney import StatementEntity
8
+
9
+ from nomenklatura.cache import Cache
10
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
11
+
12
+
13
+ log = logging.getLogger(__name__)
14
+ NOMINATIM = "https://nominatim.openstreetmap.org/search.php"
15
+
16
+
17
class NominatimEnricher(Enricher[DS]):
    """Geocode Address entities via the OpenStreetMap Nominatim API."""

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        # Warm the cache with previously stored Nominatim responses.
        self.cache.preload(f"{NOMINATIM}%")

    def search_nominatim(self, address: StatementEntity) -> Iterable[Dict[str, Any]]:
        """Query Nominatim for the address and yield raw result mappings.

        Only the first "full" value of at least 5 characters (after
        whitespace normalisation) is queried; the function returns after
        yielding that single query's results.
        """
        for full in address.get("full"):
            full_norm = squash_spaces(full)
            if len(full_norm) < 5:
                log.warning("Skipping tiny address: %s", full)
                continue
            params = {
                "q": full_norm,
                "countrycodes": address.get("country"),
                "format": "jsonv2",
                "accept-language": "en",
                "addressdetails": 1,
            }
            results = self.http_get_json_cached(NOMINATIM, params)
            log.info("OpenStreetMap geocoded [%s]: %d results", full, len(results))
            for result in results:
                yield result
            # FIXME: only best result for now.
            return

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield geocoded Address entities for an Address query entity;
        other schemata are ignored."""
        if not entity.schema.is_a("Address"):
            return

        for result in self.search_nominatim(entity):
            # pprint(result)
            addr = self.make_entity(entity, "Address")
            osm_type = result.get("osm_type")
            osm_id = result.get("osm_id")
            if osm_id is None or osm_type is None:
                continue
            # Deterministic ID derived from the OSM object reference, so
            # repeated runs produce the same entity.
            addr.id = f"osm-{osm_type}-{osm_id}"
            addr.add("full", result["display_name"])
            # addr.add("latitude", result.get("lat"))
            # addr.add("longitude", result.get("lon"))
            addr_data: Dict[str, str] = result.get("address", {})
            addr.add("country", addr_data.get("country"))
            addr.add("country", addr_data.get("country_code"))
            addr.add("city", addr_data.get("city"))
            addr.add("state", addr_data.get("state"))
            addr.add("postalCode", addr_data.get("postcode"))
            yield addr

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Addresses have no adjacent entities; re-emit the match itself."""
        yield match