nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,233 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from normality import slugify_text
|
4
|
+
from typing import cast, Any, Dict, Generator, Optional
|
5
|
+
from urllib.parse import urlparse
|
6
|
+
from banal import ensure_dict
|
7
|
+
from followthemoney import registry, DS, SE
|
8
|
+
from requests import Session
|
9
|
+
from requests.exceptions import RequestException
|
10
|
+
from rigour.urls import build_url, ParamsType
|
11
|
+
|
12
|
+
from nomenklatura.cache import Cache
|
13
|
+
from nomenklatura.enrich.common import Enricher, EnricherConfig
|
14
|
+
from nomenklatura.enrich.common import EnrichmentAbort, EnrichmentException
|
15
|
+
|
16
|
+
|
17
|
+
log = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def parse_date(raw: Any) -> Optional[str]:
    """Normalise *raw* into an FtM date string via the registry type, or None."""
    return registry.date.clean(raw)
|
22
|
+
|
23
|
+
|
24
|
+
class OpenCorporatesEnricher(Enricher[DS]):
    """Enricher that matches companies against the OpenCorporates search API.

    Results are cached via the shared ``Cache``; once the API quota is
    exhausted (HTTP 403/429) only cached responses are served for the rest
    of the run.
    """

    COMPANY_SEARCH_API = "https://api.opencorporates.com/v0.4/companies/search"
    OFFICER_SEARCH_API = "https://api.opencorporates.com/v0.4/officers/search"
    UI_PART = "://opencorporates.com/"
    API_PART = "://api.opencorporates.com/v0.4/"

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        token_var = "${OPENCORPORATES_API_TOKEN}"
        self.api_token: Optional[str] = self.get_config_expand("api_token", token_var)
        self.quota_exceeded = False
        # If expansion left the literal placeholder, no token was configured.
        if self.api_token == token_var:
            self.api_token = None
        if self.api_token is None:
            log.warning("OpenCorporates has no API token (%s)" % token_var)
        # Warm the cache for all company-search URLs (SQL LIKE pattern).
        self.cache.preload(f"{self.COMPANY_SEARCH_API}%")

    def oc_get_cached(self, url: str, params: ParamsType = None) -> Optional[Any]:
        """Fetch *url* as JSON, serving from cache when possible.

        The API token is only added to the request actually sent over the
        wire (``hidden_url``); the cache key never contains it. Returns None
        once the quota has been exceeded; raises EnrichmentAbort on 401 and
        EnrichmentException on other HTTP failures.
        """
        url = build_url(url, params=params)
        response = self.cache.get(url, max_age=self.cache_days)
        if response is None:
            if self.quota_exceeded:
                return None
            hidden_url = build_url(url, params={"api_token": self.api_token})
            try:
                resp = self.session.get(hidden_url)
                resp.raise_for_status()
            except RequestException as rex:
                if rex.response is not None:
                    if rex.response.status_code in (403, 429):
                        log.info("OpenCorporates quota exceeded; using only cache now.")
                        self.quota_exceeded = True
                        return None
                    elif rex.response.status_code == 401:
                        raise EnrichmentAbort(
                            "Authorization failure: %s" % url
                        ) from rex
                msg = "HTTP fetch failed [%s]: %s" % (url, rex)
                raise EnrichmentException(msg) from rex
            response = resp.text
            self.cache.set(url, response)
        return json.loads(response)

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield candidate company matches for *entity* (companies only; the
        officer search path is currently disabled)."""
        if not entity.schema.matchable:
            return
        if entity.has("opencorporatesUrl"):
            # TODO: fetch entity here when we start to expand with content!
            return

        if entity.schema.name in ["Company", "Organization", "LegalEntity"]:
            yield from self.search_companies(entity)
        if entity.schema.name in ["Person", "LegalEntity", "Company", "Organization"]:
            # yield from self.search_officers(entity)
            pass

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Emit a minimal clone of *match* carrying only its OpenCorporates URL."""
        clone = self.make_entity(match, match.schema.name)
        clone.id = match.id
        clone.add("opencorporatesUrl", match.get("opencorporatesUrl"))
        yield clone

    # def expand_entity(self, entity):
    #     for url in entity.get("opencorporatesUrl", quiet=True):
    #         url = self.make_url(url)
    #         data = self.get_api(url).get("results", {})
    #         if "company" in data:
    #             yield from self.expand_company(entity, data)
    #         if "officer" in data:
    #             yield from self.expand_officer(data, officer=entity)

    def make_entity_id(self, url: str) -> str:
        """Derive a stable entity ID from the path of an OpenCorporates URL."""
        parsed = urlparse(url)
        path = slugify_text(parsed.path, sep="-")
        assert path is not None, "Invalid OpenCorporates URL: %s" % url
        return f"oc-{path}"

    def jurisdiction_to_country(self, juris: Optional[Any]) -> Optional[str]:
        """Reduce a jurisdiction code like ``us_de`` to its country part (``us``)."""
        if juris is None:
            return None
        return str(juris).split("_", 1)[0]

    def company_entity(
        self, ref: SE, data: Dict[str, Any], entity: Optional[SE] = None
    ) -> SE:
        """Build (or enrich, if *entity* is given) a Company proxy from an
        OpenCorporates API record.

        Raises ValueError when the record carries no ``opencorporates_url``,
        which is needed to mint the entity ID.
        """
        if "company" in data:
            data = ensure_dict(data.get("company", data))
        oc_url = cast(Optional[str], data.get("opencorporates_url"))
        if oc_url is None:
            raise ValueError("Company has no URL: %r" % data)
        if entity is None:
            entity = self.make_entity(ref, "Company")
            entity.id = self.make_entity_id(oc_url)
        entity.add("name", data.get("name"))

        # TODO: make this an adjacent object?
        address: Dict[str, Any] = ensure_dict(data.get("registered_address"))
        entity.add("country", address.get("country"))

        juris = self.jurisdiction_to_country(data.get("jurisdiction_code"))
        entity.add("jurisdiction", juris)
        # NOTE(review): alternative_names is also iterated as dicts below —
        # this direct add presumably covers a plain-list variant; confirm.
        entity.add("alias", data.get("alternative_names"))
        entity.add("address", data.get("registered_address_in_full"))
        entity.add("sourceUrl", data.get("registry_url"))
        entity.add("legalForm", data.get("company_type"))
        inc_date = data.get("incorporation_date")
        entity.add("incorporationDate", parse_date(inc_date))
        dis_date = data.get("dissolution_date")
        entity.add("dissolutionDate", parse_date(dis_date))
        entity.add("status", data.get("current_status"))
        entity.add("registrationNumber", data.get("company_number"))
        entity.add("opencorporatesUrl", oc_url)
        source = data.get("source", {})
        entity.add("publisher", source.get("publisher"))
        entity.add("publisherUrl", source.get("url"))
        entity.add("retrievedAt", parse_date(source.get("retrieved_at")))
        for code in data.get("industry_codes", []):
            # Records may nest the payload under "industry_code".
            code = code.get("industry_code", code)
            entity.add("sector", code.get("description"))
        for previous in data.get("previous_names", []):
            entity.add("previousName", previous.get("company_name"))
        for alias in data.get("alternative_names", []):
            entity.add("alias", alias.get("company_name"))
        return entity

    # def officer_entity(self, data, entity=None):
    #     if "officer" in data:
    #         data = ensure_dict(data.get("officer", data))
    #     person = data.get("occupation") or data.get("date_of_birth")
    #     schema = "Person" if person else "LegalEntity"
    #     entity = model.make_entity(schema)
    #     entity.make_id(data.get("opencorporates_url"))
    #     entity.add("name", data.get("name"))
    #     entity.add("country", data.get("nationality"))
    #     entity.add("jurisdiction", data.get("jurisdiction_code"))
    #     entity.add("address", data.get("address"))
    #     entity.add("birthDate", data.get("date_of_birth"), quiet=True)
    #     entity.add("position", data.get("occupation"), quiet=True)
    #     entity.add("opencorporatesUrl", data.get("opencorporates_url"))
    #     source = data.get("source", {})
    #     entity.add("publisher", source.get("publisher"))
    #     entity.add("publisherUrl", source.get("url"))
    #     entity.add("retrievedAt", source.get("retrieved_at"))
    #     return entity

    def search_companies(self, entity: SE) -> Generator[SE, None, None]:
        """Page through the company-search API (max 8 pages) and yield proxies."""
        countries = entity.get_type_values(registry.country)
        params = {"q": entity.caption, "sparse": True, "country_codes": countries}
        for page in range(1, 9):
            params["page"] = page
            results = self.oc_get_cached(self.COMPANY_SEARCH_API, params=params)
            if results is None:
                # Quota exhausted and not cached — stop paging.
                break

            # print(results)
            for company in results.get("results", {}).get("companies", []):
                proxy = self.company_entity(entity, company)
                yield proxy
            if page >= results.get("total_pages", 0):
                break

    # def search_officers(self, entity):
    #     params = self.get_query(entity)
    #     for page in range(1, 9):
    #         params["page"] = page
    #         url = self.make_url(self.OFFICER_SEARCH_API, params)
    #         results = self.get_api(url)
    #         officers = results.get("results", {}).get("officers")
    #         for officer in ensure_list(officers):
    #             proxy = self.officer_entity(officer)
    #             yield self.make_match(entity, proxy)
    #         if page >= results.get("total_pages", 0):
    #             break

    # def enrich_entity(self, entity):
    #     schema = entity.schema.name

    #     if schema in ["Person", "LegalEntity", "Company", "Organization"]:
    #         yield from self.search_officers(entity)

    # def expand_company(self, entity, data):
    #     data = ensure_dict(data.get("company", data))
    #     entity = self.company_entity(data, entity=entity)
    #     for officer in ensure_list(data.get("officers")):
    #         yield from self.expand_officer(officer, company=entity)
    #     yield entity

    # def expand_officer(self, data, entity=None, company=None):
    #     data = ensure_dict(data.get("officer", data))
    #     entity = self.officer_entity(data, entity=entity)
    #     yield entity

    #     company = self.company_entity(data.get("company"), entity=company)
    #     yield company

    #     if company.id and entity.id:
    #         directorship = model.make_entity("Directorship")
    #         directorship.make_id(data.get("opencorporates_url"), "Directorship")
    #         directorship.add("director", entity)
    #         directorship.add("startDate", data.get("start_date"))
    #         directorship.add("endDate", data.get("end_date"))
    #         directorship.add("organization", company)
    #         directorship.add("role", data.get("position"))
    #         yield directorship
|
@@ -0,0 +1,124 @@
|
|
1
|
+
import os
|
2
|
+
import logging
|
3
|
+
from typing import Generator, Dict, Optional
|
4
|
+
from followthemoney.util import make_entity_id
|
5
|
+
from followthemoney import DS, SE
|
6
|
+
from requests import Session
|
7
|
+
|
8
|
+
from nomenklatura.cache import Cache
|
9
|
+
from nomenklatura.enrich.common import Enricher, EnricherConfig
|
10
|
+
|
11
|
+
log = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class OpenFIGIEnricher(Enricher[DS]):
    """Uses the `OpenFIGI` search API to look up FIGIs by company name."""

    SEARCH_URL = "https://api.openfigi.com/v3/search"
    MAPPING_URL = "https://api.openfigi.com/v3/mapping"

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        api_key_var = "${OPENFIGI_API_KEY}"
        self.api_key: Optional[str] = self.get_config_expand("api_key", api_key_var)
        # If expansion left the literal placeholder, no key was configured.
        if self.api_key == api_key_var:
            self.api_key = None
        if self.api_key is None:
            # FIX: message previously said "PermID" (copy-paste from the
            # PermID enricher) — this is the OpenFIGI key.
            log.warning("OpenFIGI has no API key (%s)" % api_key_var)

        # FIX: previously the header was populated from a direct
        # os.environ read, silently ignoring a config-supplied api_key.
        if self.api_key is not None:
            self.session.headers["X-OPENFIGI-APIKEY"] = self.api_key

    def make_company_id(self, name: str) -> str:
        """Mint a deterministic issuer-entity ID from the company name."""
        return f"figi-company-{make_entity_id(name)}"

    def make_security_id(self, figi: str) -> str:
        """Mint a security-entity ID from a FIGI code."""
        return f"figi-{figi}"

    def search(self, query: str) -> Generator[Dict[str, str], None, None]:
        """Page through the OpenFIGI search API, yielding raw result dicts.

        Pagination uses the ``next`` token returned by the API; each page is
        cached under a key that includes the offset.
        """
        body = {"query": query}
        offset = None  # renamed from `next`, which shadowed the builtin

        while True:
            if offset is not None:
                body["start"] = offset

            log.info(f"Searching {query!r}, offset={offset}")
            cache_key = f"{self.SEARCH_URL}:{query}:{offset}"
            resp = self.http_post_json_cached(self.SEARCH_URL, cache_key, json=body)
            if "data" in resp:
                yield from resp["data"]

            offset = resp.get("next", None)
            if offset is None:
                break

    def match_organization(self, entity: SE) -> Generator[SE, None, None]:
        """Yield Company proxies for every named search hit on *entity*."""
        for name in entity.get("name"):
            for match in self.search(name):
                match_name = match.get("name", None)
                if match_name is None:
                    continue
                other = self.make_entity(entity, "Company")
                other.id = self.make_company_id(match_name)
                other.add("name", match_name)
                other.add("topics", "corp.public")
                yield other

    def match_security(self, entity: SE) -> Generator[SE, None, None]:
        """Map each ISIN of *entity* to FIGI records; yield Security proxies.

        Only composite-level FIGIs are kept (entries whose figi differs from
        their compositeFIGI are exchange-level listings and are skipped).
        """
        for isin in entity.get("isin"):
            cache_key = f"{self.MAPPING_URL}:ISIN:{isin}"
            query = [{"idType": "ID_ISIN", "idValue": isin}]
            resp = self.http_post_json_cached(self.MAPPING_URL, cache_key, json=query)
            for section in resp:
                for item in section.get("data", []):
                    figi = item["figi"]
                    if figi != item.get("compositeFIGI", figi):
                        continue
                    security = self.make_entity(entity, "Security")
                    # security.id = self.make_security_id(item["figi"])
                    security.id = entity.id
                    security.add("isin", isin)
                    security.add("figiCode", item["figi"])
                    security.add("ticker", item["ticker"])
                    security.add("type", item["securityType"])
                    yield security

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Dispatch on entity schema: organizations by name, securities by ISIN."""
        if entity.schema.is_a("Organization"):
            yield from self.match_organization(entity)
        if entity.schema.is_a("Security"):
            yield from self.match_security(entity)

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Expand a confirmed match; for issuers, also emit their securities."""
        if match.schema.is_a("Security"):
            yield match
        if match.schema.is_a("Organization"):
            name = match.first("name")
            if name is None:
                return
            yield match
            for item in self.search(name):
                # Only emit the securities which match the name of the positive match
                # to the company exactly. Skip everything else.
                if item["name"] != name:
                    continue

                figi = item["figi"]
                security = self.make_entity(match, "Security")
                security.id = self.make_security_id(figi)
                security.add("figiCode", figi)
                security.add("issuer", match)
                security.add("ticker", item["ticker"])
                security.add("type", item["securityType"])
                # if item["exchCode"] is not None:
                #     security.add("notes", f'exchange {item["exchCode"]}')
                security.add("description", item["securityDescription"])
                yield security
|
@@ -0,0 +1,201 @@
|
|
1
|
+
import csv
|
2
|
+
import io
|
3
|
+
import json
|
4
|
+
import logging
|
5
|
+
from functools import lru_cache
|
6
|
+
from itertools import product
|
7
|
+
from typing import cast, Set, Generator, Optional, Dict, Any
|
8
|
+
from urllib.parse import urljoin
|
9
|
+
|
10
|
+
from followthemoney import StatementEntity, registry, DS, SE
|
11
|
+
from lxml import etree
|
12
|
+
from requests import Session
|
13
|
+
|
14
|
+
from nomenklatura.cache import Cache
|
15
|
+
from nomenklatura.enrich.common import Enricher, EnricherConfig
|
16
|
+
from nomenklatura.enrich.common import EnrichmentAbort
|
17
|
+
from nomenklatura.matching.compat import fingerprint_name
|
18
|
+
|
19
|
+
log = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
# XML namespace prefix for GeoNames ontology elements (used with findall).
GN = "{http://www.geonames.org/ontology#}"
# Maps Refinitiv activity-status URIs to human-readable status labels.
STATUS = {
    "tr-org:statusActive": "Active",
    "tr-org:statusInActive": "Inactive",
}
|
26
|
+
|
27
|
+
|
28
|
+
class PermIDEnricher(Enricher[DS]):
    """Enricher matching organizations against the Refinitiv PermID API.

    Sends a CSV batch of (name, country) query rows to the match endpoint,
    then resolves each matched PermID record into a Company proxy.
    """

    MATCHING_API = "https://api-eit.refinitiv.com/permid/match"

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        token_var = "${PERMID_API_TOKEN}"
        self.api_token: Optional[str] = self.get_config_expand("api_token", token_var)
        # If expansion left the literal placeholder, no token was configured.
        if self.api_token == token_var:
            self.api_token = None
        if self.api_token is None:
            log.warning("PermID has no API token (%s)" % token_var)
        self.quota_exceeded = False

    def entity_to_queries(self, entity: StatementEntity) -> bytes:
        """Render up to 999 (name, country) query rows for *entity* as CSV bytes.

        The 999-row cap is the PermID batch-match limit; fingerprinted name
        variants are added only while there is room under that cap.
        """
        names = entity.get_type_values(registry.name, matchable=True)
        countries = entity.get("jurisdiction", quiet=True)
        if not len(countries):
            countries = entity.get_type_values(registry.country, matchable=True)
        country_set = {c.upper()[:2] for c in countries}
        if len(country_set) == 0:
            country_set.add("")
        if len(names) * len(country_set) < 999:
            # Also query without a country filter when the budget allows.
            country_set.add("")
        if len(names) * len(country_set) < 999:
            fp = fingerprint_name(entity.caption)
            if fp is not None and fp not in names:
                names.append(fp)
        for name in entity.get("name", quiet=True):
            if len(names) * len(country_set) >= 999:
                break
            # FIX: previously fingerprinted entity.caption on every
            # iteration, so per-name variants were never generated.
            fp = fingerprint_name(name)
            if fp is not None and fp not in names:
                names.append(fp)
        sio = io.StringIO()
        writer = csv.writer(sio, dialect=csv.unix_dialect, delimiter=",")
        # LocalID,Standard Identifier,Name,Country,Street,City,PostalCode,State,Website
        writer.writerow(["LocalID", "Standard Identifier", "Name", "Country"])
        lei_code = entity.first("leiCode", quiet=True)
        if lei_code is not None:
            lei_code = f"LEI:{lei_code}"
        else:
            lei_code = ""
        for name, country in list(product(names, country_set))[:999]:
            writer.writerow([entity.id, lei_code, name, country])
        sio.seek(0)
        return sio.getvalue().encode("utf-8")

    # NOTE: lru_cache on an instance method keys on `self` and keeps the
    # enricher alive for the cache's lifetime; acceptable here since one
    # enricher instance spans the process.
    @lru_cache(maxsize=1000)
    def fetch_placename(self, value: Optional[str]) -> Optional[str]:
        """Resolve a GeoNames URL to a country code (preferred) or place name."""
        if value is None:
            return None
        if not value.startswith("http://sws.geonames.org/"):
            raise ValueError("Not a GeoNames URL: %s" % value)
        url = urljoin(value, "about.rdf")
        res = self.http_get_cached(url, cache_days=120)
        try:
            # FIX: was res.encode("utf=8") — an invalid codec name that
            # raised LookupError on every call, so every response was
            # treated as invalid and discarded.
            doc = etree.fromstring(res.encode("utf-8"))
        except Exception:
            # FIX: log.warn is a deprecated alias of log.warning.
            log.warning("Invalid GeoNames response: %s", url)
            self.http_remove_cache(url)
            return None
        for code in doc.findall(".//%scountryCode" % GN):
            return code.text
        for name in doc.findall(".//%sname" % GN):
            return name.text
        return value

    def fetch_permid(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch a PermID record as JSON-LD; None (and cache eviction) on parse failure."""
        params = {"format": "json-ld"}
        # The access token travels in hidden params so it never enters cache keys.
        hidden = {"access-token": self.api_token}
        res_raw = self.http_get_cached(url, params=params, hidden=hidden, cache_days=90)
        try:
            return cast(Dict[str, Any], json.loads(res_raw))
        except Exception:
            log.info("Invalid response from PermID: %s", url)
            self.http_remove_cache(url, params=params)
            return None

    def fetch_perm_org(self, entity: SE, url: str) -> Optional[SE]:
        """Turn the PermID organization record at *url* into a Company proxy."""
        res = self.fetch_permid(url)
        if res is None:
            return None
        # Strip JSON-LD framing and fields we do not map.
        res.pop("@id", None)
        res.pop("@type", None)
        res.pop("@context", None)
        res.pop("hasPrimaryIndustryGroup", None)

        # Fall back to the trailing URL segment when the record omits the ID.
        perm_id = res.pop("tr-common:hasPermId", url.rsplit("-", 1)[-1])
        lei_code = res.pop("tr-org:hasLEI", None)
        match = self.make_entity(entity, "Company")
        # Prefer the LEI as a globally stable identifier when present.
        match.id = f"lei-{lei_code}" if lei_code is not None else f"permid-{perm_id}"
        match.add("sourceUrl", url)
        match.add("leiCode", lei_code)
        match.add("permId", perm_id)
        match.add("name", res.pop("vcard:organization-name", None))
        match.add("website", res.pop("hasURL", None))
        match.add("country", self.fetch_placename(res.pop("isDomiciledIn", None)))
        incorporated = self.fetch_placename(res.pop("isIncorporatedIn", None))
        match.add("jurisdiction", incorporated)
        inc_date = res.pop("hasLatestOrganizationFoundedDate", None)
        match.add("incorporationDate", inc_date)

        hq_addr = res.pop("mdaas:HeadquartersAddress", None)
        reg_addr = res.pop("mdaas:RegisteredAddress", None)
        for addr in (hq_addr, reg_addr):
            if addr is not None:
                # Collapse the multi-line postal address into one line.
                addr = ", ".join(addr.split("\n"))
                addr = addr.replace(",,", ",").strip().strip(",")
                match.add("address", addr)
        status_uri = res.pop("hasActivityStatus", None)
        status = STATUS.get(status_uri)
        if status is None:
            log.warning("Unknown status: %s" % status_uri)
        match.add("status", status)
        match.add("phone", res.pop("tr-org:hasHeadquartersPhoneNumber", None))
        match.add("phone", res.pop("tr-org:hasRegisteredPhoneNumber", None))
        res.pop("tr-org:hasHeadquartersFaxNumber", None)
        res.pop("tr-org:hasRegisteredFaxNumber", None)

        quote = res.pop("hasOrganizationPrimaryQuote", None)
        if quote is not None:
            quote_res = self.fetch_permid(quote)
            if quote_res is not None:
                match.add("ticker", quote_res.pop("tr-fin:hasExchangeTicker", None))
                match.add("ricCode", quote_res.pop("tr-fin:hasRic", None))
                match.add("topics", "corp.public")
        return match

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield PermID matches for an Organization; disables itself on quota abort."""
        if self.quota_exceeded:
            return
        if not entity.schema.is_a("Organization"):
            return
        try:
            # Direct lookups for PermIDs the entity already carries.
            for permid in entity.get("permId", quiet=True):
                permid_url = f"https://permid.org/1-{permid}"
                match = self.fetch_perm_org(entity, permid_url)
                if match is not None:
                    yield match
            headers = {
                "x-openmatch-numberOfMatchesPerRecord": "4",
                "X-AG-Access-Token": self.api_token,
                "x-openmatch-dataType": "Organization",
            }
            cache_key = f"permid:{entity.id}"
            query = self.entity_to_queries(entity)
            res = self.http_post_json_cached(
                self.MATCHING_API,
                cache_key,
                data=query,
                headers=headers,
                cache_days=self.cache_days,
            )
            seen_matches: Set[str] = set()
            for result in res.get("outputContentResponse", []):
                match_permid_url = result.get("Match OpenPermID")
                if match_permid_url is None or match_permid_url in seen_matches:
                    continue
                seen_matches.add(match_permid_url)
                match = self.fetch_perm_org(entity, match_permid_url)
                if match is not None:
                    yield match
        except EnrichmentAbort as exc:
            self.quota_exceeded = True
            log.warning("PermID quota exceeded: %s", exc)

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Expansion is a no-op: the match proxy already carries all data."""
        yield match
|