nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,233 @@
1
+ import json
2
+ import logging
3
+ from normality import slugify_text
4
+ from typing import cast, Any, Dict, Generator, Optional
5
+ from urllib.parse import urlparse
6
+ from banal import ensure_dict
7
+ from followthemoney import registry, DS, SE
8
+ from requests import Session
9
+ from requests.exceptions import RequestException
10
+ from rigour.urls import build_url, ParamsType
11
+
12
+ from nomenklatura.cache import Cache
13
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
14
+ from nomenklatura.enrich.common import EnrichmentAbort, EnrichmentException
15
+
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+
20
def parse_date(raw: Any) -> Optional[str]:
    """Clean a raw date value using the FollowTheMoney date type.

    Returns whatever `registry.date.clean` produces for `raw`, which is
    typed here as an optional string.
    """
    cleaned: Optional[str] = registry.date.clean(raw)
    return cleaned
22
+
23
+
24
class OpenCorporatesEnricher(Enricher[DS]):
    """Enricher that looks up matchable entities in the OpenCorporates
    company search API and converts the hits into `Company` entities.

    Responses are cached by URL (without the API token). Once the API
    answers with HTTP 403 or 429 the enricher stops issuing requests and
    serves results from the cache only.
    """

    COMPANY_SEARCH_API = "https://api.opencorporates.com/v0.4/companies/search"
    OFFICER_SEARCH_API = "https://api.opencorporates.com/v0.4/officers/search"
    UI_PART = "://opencorporates.com/"
    API_PART = "://api.opencorporates.com/v0.4/"

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        """Read the API token from the configuration and warm the cache."""
        super().__init__(dataset, cache, config, session)
        token_var = "${OPENCORPORATES_API_TOKEN}"
        self.api_token: Optional[str] = self.get_config_expand("api_token", token_var)
        self.quota_exceeded = False
        # If the value comes back as the unexpanded placeholder, no token
        # was configured.
        if self.api_token == token_var:
            self.api_token = None
        if self.api_token is None:
            log.warning("OpenCorporates has no API token (%s)", token_var)
        self.cache.preload(f"{self.COMPANY_SEARCH_API}%")

    def oc_get_cached(self, url: str, params: ParamsType = None) -> Optional[Any]:
        """Fetch a JSON document from OpenCorporates, going through the cache.

        Returns the decoded JSON, or None when the response is not cached
        and the quota has been exhausted.

        Raises:
            EnrichmentAbort: on HTTP 401 (authorization failure).
            EnrichmentException: on any other request failure.
        """
        url = build_url(url, params=params)
        response = self.cache.get(url, max_age=self.cache_days)
        if response is None:
            if self.quota_exceeded:
                return None
            # The token is appended only to the URL that is requested, so
            # the cache key (`url`) never contains the secret.
            hidden_url = build_url(url, params={"api_token": self.api_token})
            try:
                resp = self.session.get(hidden_url)
                resp.raise_for_status()
            except RequestException as rex:
                if rex.response is not None:
                    if rex.response.status_code in (403, 429):
                        log.info("OpenCorporates quota exceeded; using only cache now.")
                        self.quota_exceeded = True
                        return None
                    elif rex.response.status_code == 401:
                        raise EnrichmentAbort(
                            "Authorization failure: %s" % url
                        ) from rex
                msg = "HTTP fetch failed [%s]: %s" % (url, rex)
                raise EnrichmentException(msg) from rex
            response = resp.text
            self.cache.set(url, response)
        return json.loads(response)

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield candidate company entities for a matchable entity."""
        if not entity.schema.matchable:
            return
        if entity.has("opencorporatesUrl"):
            # TODO: fetch entity here when we start to expand with content!
            return

        if entity.schema.name in ["Company", "Organization", "LegalEntity"]:
            yield from self.search_companies(entity)
        if entity.schema.name in ["Person", "LegalEntity", "Company", "Organization"]:
            # yield from self.search_officers(entity)
            pass

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield a copy of `match` carrying only its ID and OpenCorporates URL."""
        clone = self.make_entity(match, match.schema.name)
        clone.id = match.id
        clone.add("opencorporatesUrl", match.get("opencorporatesUrl"))
        yield clone

    # def expand_entity(self, entity):
    #     for url in entity.get("opencorporatesUrl", quiet=True):
    #         url = self.make_url(url)
    #         data = self.get_api(url).get("results", {})
    #         if "company" in data:
    #             yield from self.expand_company(entity, data)
    #         if "officer" in data:
    #             yield from self.expand_officer(data, officer=entity)

    def make_entity_id(self, url: str) -> str:
        """Derive an `oc-` prefixed entity ID from the path of an OpenCorporates URL.

        Raises:
            ValueError: if the URL path cannot be slugified.
        """
        parsed = urlparse(url)
        path = slugify_text(parsed.path, sep="-")
        # Raise explicitly instead of asserting: asserts are stripped under -O.
        if path is None:
            raise ValueError("Invalid OpenCorporates URL: %s" % url)
        return f"oc-{path}"

    def jurisdiction_to_country(self, juris: Optional[Any]) -> Optional[str]:
        """Return the part of a jurisdiction code before the first underscore."""
        if juris is None:
            return None
        return str(juris).split("_", 1)[0]

    def company_entity(
        self, ref: SE, data: Dict[str, Any], entity: Optional[SE] = None
    ) -> SE:
        """Convert an OpenCorporates company record into a `Company` entity.

        Args:
            ref: entity passed to `make_entity` when a new entity is created.
            data: the company record, optionally wrapped in a `company` key.
            entity: an existing entity to populate instead of creating one.

        Raises:
            ValueError: if the record has no `opencorporates_url`.
        """
        if "company" in data:
            data = ensure_dict(data.get("company", data))
        oc_url = cast(Optional[str], data.get("opencorporates_url"))
        if oc_url is None:
            raise ValueError("Company has no URL: %r" % data)
        if entity is None:
            entity = self.make_entity(ref, "Company")
            entity.id = self.make_entity_id(oc_url)
        entity.add("name", data.get("name"))

        # TODO: make this an adjacent object?
        address: Dict[str, Any] = ensure_dict(data.get("registered_address"))
        entity.add("country", address.get("country"))

        juris = self.jurisdiction_to_country(data.get("jurisdiction_code"))
        entity.add("jurisdiction", juris)
        # NOTE(review): this adds the raw `alternative_names` value, while the
        # loop at the end of this method adds each entry's `company_name` —
        # confirm whether this first add is intended.
        entity.add("alias", data.get("alternative_names"))
        entity.add("address", data.get("registered_address_in_full"))
        entity.add("sourceUrl", data.get("registry_url"))
        entity.add("legalForm", data.get("company_type"))
        inc_date = data.get("incorporation_date")
        entity.add("incorporationDate", parse_date(inc_date))
        dis_date = data.get("dissolution_date")
        entity.add("dissolutionDate", parse_date(dis_date))
        entity.add("status", data.get("current_status"))
        entity.add("registrationNumber", data.get("company_number"))
        entity.add("opencorporatesUrl", oc_url)
        # A JSON `null` makes `.get(key, default)` return None rather than the
        # default, so normalise missing/null containers before using them.
        source = ensure_dict(data.get("source"))
        entity.add("publisher", source.get("publisher"))
        entity.add("publisherUrl", source.get("url"))
        entity.add("retrievedAt", parse_date(source.get("retrieved_at")))
        for code in data.get("industry_codes") or []:
            code = code.get("industry_code", code)
            entity.add("sector", code.get("description"))
        for previous in data.get("previous_names") or []:
            entity.add("previousName", previous.get("company_name"))
        for alias in data.get("alternative_names") or []:
            entity.add("alias", alias.get("company_name"))
        return entity

    # def officer_entity(self, data, entity=None):
    #     if "officer" in data:
    #         data = ensure_dict(data.get("officer", data))
    #     person = data.get("occupation") or data.get("date_of_birth")
    #     schema = "Person" if person else "LegalEntity"
    #     entity = model.make_entity(schema)
    #     entity.make_id(data.get("opencorporates_url"))
    #     entity.add("name", data.get("name"))
    #     entity.add("country", data.get("nationality"))
    #     entity.add("jurisdiction", data.get("jurisdiction_code"))
    #     entity.add("address", data.get("address"))
    #     entity.add("birthDate", data.get("date_of_birth"), quiet=True)
    #     entity.add("position", data.get("occupation"), quiet=True)
    #     entity.add("opencorporatesUrl", data.get("opencorporates_url"))
    #     source = data.get("source", {})
    #     entity.add("publisher", source.get("publisher"))
    #     entity.add("publisherUrl", source.get("url"))
    #     entity.add("retrievedAt", source.get("retrieved_at"))
    #     return entity

    def search_companies(self, entity: SE) -> Generator[SE, None, None]:
        """Search companies by the entity's caption and countries.

        Requests up to eight result pages and yields one entity per hit;
        stops early when a page cannot be fetched.
        """
        countries = entity.get_type_values(registry.country)
        params = {"q": entity.caption, "sparse": True, "country_codes": countries}
        for page in range(1, 9):
            params["page"] = page
            results = self.oc_get_cached(self.COMPANY_SEARCH_API, params=params)
            if results is None:
                break

            # print(results)
            companies = ensure_dict(results.get("results")).get("companies") or []
            for company in companies:
                proxy = self.company_entity(entity, company)
                yield proxy
            # NOTE(review): the OpenCorporates response is believed to nest
            # `total_pages` under `results`; if so, this top-level lookup
            # always yields 0 and the loop stops after the first page.
            # Confirm before changing — paginating further consumes quota.
            if page >= results.get("total_pages", 0):
                break
190
+
191
+ # def search_officers(self, entity):
192
+ # params = self.get_query(entity)
193
+ # for page in range(1, 9):
194
+ # params["page"] = page
195
+ # url = self.make_url(self.OFFICER_SEARCH_API, params)
196
+ # results = self.get_api(url)
197
+ # officers = results.get("results", {}).get("officers")
198
+ # for officer in ensure_list(officers):
199
+ # proxy = self.officer_entity(officer)
200
+ # yield self.make_match(entity, proxy)
201
+ # if page >= results.get("total_pages", 0):
202
+ # break
203
+
204
+ # def enrich_entity(self, entity):
205
+ # schema = entity.schema.name
206
+
207
+ # if schema in ["Person", "LegalEntity", "Company", "Organization"]:
208
+ # yield from self.search_officers(entity)
209
+
210
+ # def expand_company(self, entity, data):
211
+ # data = ensure_dict(data.get("company", data))
212
+ # entity = self.company_entity(data, entity=entity)
213
+ # for officer in ensure_list(data.get("officers")):
214
+ # yield from self.expand_officer(officer, company=entity)
215
+ # yield entity
216
+
217
+ # def expand_officer(self, data, entity=None, company=None):
218
+ # data = ensure_dict(data.get("officer", data))
219
+ # entity = self.officer_entity(data, entity=entity)
220
+ # yield entity
221
+
222
+ # company = self.company_entity(data.get("company"), entity=company)
223
+ # yield company
224
+
225
+ # if company.id and entity.id:
226
+ # directorship = model.make_entity("Directorship")
227
+ # directorship.make_id(data.get("opencorporates_url"), "Directorship")
228
+ # directorship.add("director", entity)
229
+ # directorship.add("startDate", data.get("start_date"))
230
+ # directorship.add("endDate", data.get("end_date"))
231
+ # directorship.add("organization", company)
232
+ # directorship.add("role", data.get("position"))
233
+ # yield directorship
@@ -0,0 +1,124 @@
1
+ import os
2
+ import logging
3
+ from typing import Generator, Dict, Optional
4
+ from followthemoney.util import make_entity_id
5
+ from followthemoney import DS, SE
6
+ from requests import Session
7
+
8
+ from nomenklatura.cache import Cache
9
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+
14
class OpenFIGIEnricher(Enricher[DS]):
    """Uses the `OpenFIGI` search API to look up FIGIs by company name."""

    SEARCH_URL = "https://api.openfigi.com/v3/search"
    MAPPING_URL = "https://api.openfigi.com/v3/mapping"

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        """Resolve the API key and attach it to the HTTP session headers."""
        super().__init__(dataset, cache, config, session)
        api_key_var = "${OPENFIGI_API_KEY}"
        self.api_key: Optional[str] = self.get_config_expand("api_key", api_key_var)
        # If the value comes back as the unexpanded placeholder, no key
        # was configured.
        if self.api_key == api_key_var:
            self.api_key = None
        if self.api_key is None:
            log.warning("OpenFIGI has no API key (%s)", api_key_var)

        # Prefer the key resolved from the enricher configuration; fall back
        # to the environment variable so existing setups keep working.
        api_key = self.api_key or os.environ.get("OPENFIGI_API_KEY")
        if api_key is not None:
            self.session.headers["X-OPENFIGI-APIKEY"] = api_key

    def make_company_id(self, name: str) -> str:
        """Build a company entity ID from a hash of the company name."""
        return f"figi-company-{make_entity_id(name)}"

    def make_security_id(self, figi: str) -> str:
        """Build a security entity ID from its FIGI."""
        return f"figi-{figi}"

    def search(self, query: str) -> Generator[Dict[str, str], None, None]:
        """Yield all search results for `query`, following `next` cursors."""
        body = {"query": query}
        start = None

        while True:
            if start is not None:
                body["start"] = start

            log.info("Searching %r, offset=%s", query, start)
            cache_key = f"{self.SEARCH_URL}:{query}:{start}"
            resp = self.http_post_json_cached(self.SEARCH_URL, cache_key, json=body)
            if "data" in resp:
                yield from resp["data"]

            start = resp.get("next", None)
            if start is None:
                break

    def match_organization(self, entity: SE) -> Generator[SE, None, None]:
        """Yield a `Company` candidate for every named search hit."""
        for name in entity.get("name"):
            for match in self.search(name):
                match_name = match.get("name", None)
                if match_name is None:
                    continue
                other = self.make_entity(entity, "Company")
                other.id = self.make_company_id(match_name)
                other.add("name", match_name)
                other.add("topics", "corp.public")
                yield other

    def match_security(self, entity: SE) -> Generator[SE, None, None]:
        """Map the entity's ISINs to FIGIs and yield enriched `Security` entities."""
        for isin in entity.get("isin"):
            cache_key = f"{self.MAPPING_URL}:ISIN:{isin}"
            query = [{"idType": "ID_ISIN", "idValue": isin}]
            resp = self.http_post_json_cached(self.MAPPING_URL, cache_key, json=query)
            for section in resp:
                for item in section.get("data", []):
                    figi = item.get("figi")
                    if figi is None:
                        continue
                    # Keep only items whose FIGI equals their composite FIGI
                    # (or that carry no `compositeFIGI` key at all).
                    if figi != item.get("compositeFIGI", figi):
                        continue
                    security = self.make_entity(entity, "Security")
                    # security.id = self.make_security_id(item["figi"])
                    security.id = entity.id
                    security.add("isin", isin)
                    security.add("figiCode", figi)
                    security.add("ticker", item.get("ticker"))
                    security.add("type", item.get("securityType"))
                    yield security

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Dispatch to organization and/or security matching by schema."""
        if entity.schema.is_a("Organization"):
            yield from self.match_organization(entity)
        if entity.schema.is_a("Security"):
            yield from self.match_security(entity)

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield the match and, for organizations, its exactly-named securities."""
        if match.schema.is_a("Security"):
            yield match
        if match.schema.is_a("Organization"):
            name = match.first("name")
            if name is None:
                return
            yield match
            for item in self.search(name):
                # Only emit the securities which match the name of the positive match
                # to the company exactly. Skip everything else.
                if item.get("name") != name:
                    continue

                figi = item.get("figi")
                if figi is None:
                    # Without a FIGI there is no identifier to build an ID from.
                    continue
                security = self.make_entity(match, "Security")
                security.id = self.make_security_id(figi)
                security.add("figiCode", figi)
                security.add("issuer", match)
                security.add("ticker", item.get("ticker"))
                security.add("type", item.get("securityType"))
                # if item["exchCode"] is not None:
                #     security.add("notes", f'exchange {item["exchCode"]}')
                security.add("description", item.get("securityDescription"))
                yield security
@@ -0,0 +1,201 @@
1
+ import csv
2
+ import io
3
+ import json
4
+ import logging
5
+ from functools import lru_cache
6
+ from itertools import product
7
+ from typing import cast, Set, Generator, Optional, Dict, Any
8
+ from urllib.parse import urljoin
9
+
10
+ from followthemoney import StatementEntity, registry, DS, SE
11
+ from lxml import etree
12
+ from requests import Session
13
+
14
+ from nomenklatura.cache import Cache
15
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
16
+ from nomenklatura.enrich.common import EnrichmentAbort
17
+ from nomenklatura.matching.compat import fingerprint_name
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
# XML namespace prefix (in `{uri}` form) used when looking up GeoNames
# ontology elements in the fetched RDF documents.
GN = "{http://www.geonames.org/ontology#}"
# Maps the activity-status identifiers found in PermID records to the
# status text stored on the resulting entity.
STATUS = {
    "tr-org:statusActive": "Active",
    "tr-org:statusInActive": "Inactive",
}
26
+
27
+
28
class PermIDEnricher(Enricher[DS]):
    """Enricher that matches organizations against the PermID record
    matching API and converts the matched records into `Company` entities.
    """

    MATCHING_API = "https://api-eit.refinitiv.com/permid/match"

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        """Read the API token from the enricher configuration."""
        super().__init__(dataset, cache, config, session)
        token_var = "${PERMID_API_TOKEN}"
        self.api_token: Optional[str] = self.get_config_expand("api_token", token_var)
        # If the value comes back as the unexpanded placeholder, no token
        # was configured.
        if self.api_token == token_var:
            self.api_token = None
        if self.api_token is None:
            log.warning("PermID has no API token (%s)", token_var)
        self.quota_exceeded = False

    def entity_to_queries(self, entity: StatementEntity) -> bytes:
        """Serialise an entity into the CSV payload sent to the matching API.

        One row is written per (name, country) combination, capped at 999
        rows. Name fingerprints and an empty country are added only while
        the row budget allows.
        """
        names = entity.get_type_values(registry.name, matchable=True)
        countries = entity.get("jurisdiction", quiet=True)
        if not len(countries):
            countries = entity.get_type_values(registry.country, matchable=True)
        country_set = {c.upper()[:2] for c in countries}
        if len(country_set) == 0:
            country_set.add("")
        if len(names) * len(country_set) < 999:
            country_set.add("")
        if len(names) * len(country_set) < 999:
            fp = fingerprint_name(entity.caption)
            if fp is not None and fp not in names:
                names.append(fp)
        for name in entity.get("name", quiet=True):
            if len(names) * len(country_set) >= 999:
                break
            # Fingerprint each individual name; the caption has already been
            # handled above.
            fp = fingerprint_name(name)
            if fp is not None and fp not in names:
                names.append(fp)
        sio = io.StringIO()
        writer = csv.writer(sio, dialect=csv.unix_dialect, delimiter=",")
        # LocalID,Standard Identifier,Name,Country,Street,City,PostalCode,State,Website
        writer.writerow(["LocalID", "Standard Identifier", "Name", "Country"])
        lei_code = entity.first("leiCode", quiet=True)
        if lei_code is not None:
            lei_code = f"LEI:{lei_code}"
        else:
            lei_code = ""
        for name, country in list(product(names, country_set))[:999]:
            writer.writerow([entity.id, lei_code, name, country])
        return sio.getvalue().encode("utf-8")

    @lru_cache(maxsize=1000)
    def fetch_placename(self, value: Optional[str]) -> Optional[str]:
        """Resolve a GeoNames URL to a country code or place name.

        Returns the first `countryCode` element's text if one exists,
        otherwise the first `name` element's text, otherwise `value` itself.
        Returns None for a None input or an unparseable response.

        Raises:
            ValueError: if `value` is not a `http://sws.geonames.org/` URL.
        """
        if value is None:
            return None
        if not value.startswith("http://sws.geonames.org/"):
            raise ValueError("Not a GeoNames URL: %s" % value)
        url = urljoin(value, "about.rdf")
        res = self.http_get_cached(url, cache_days=120)
        try:
            doc = etree.fromstring(res.encode("utf-8"))
        except Exception:
            log.warning("Invalid GeoNames response: %s", url)
            # Drop the bad payload so the next call re-fetches it.
            self.http_remove_cache(url)
            return None
        for code in doc.findall(".//%scountryCode" % GN):
            return code.text
        for name in doc.findall(".//%sname" % GN):
            return name.text
        return value

    def fetch_permid(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch a PermID record as JSON-LD; return None if it cannot be parsed."""
        params = {"format": "json-ld"}
        hidden = {"access-token": self.api_token}
        res_raw = self.http_get_cached(url, params=params, hidden=hidden, cache_days=90)
        try:
            return cast(Dict[str, Any], json.loads(res_raw))
        except Exception:
            log.info("Invalid response from PermID: %s", url)
            # Drop the bad payload so the next call re-fetches it.
            self.http_remove_cache(url, params=params)
            return None

    def fetch_perm_org(self, entity: SE, url: str) -> Optional[SE]:
        """Fetch a PermID organization record and convert it to a `Company`.

        The entity ID is `lei-<code>` when the record has an LEI and
        `permid-<id>` otherwise. Returns None when the record cannot be
        fetched.
        """
        res = self.fetch_permid(url)
        if res is None:
            return None
        res.pop("@id", None)
        res.pop("@type", None)
        res.pop("@context", None)
        res.pop("hasPrimaryIndustryGroup", None)

        # Fall back to the trailing URL segment when the record has no PermID.
        perm_id = res.pop("tr-common:hasPermId", url.rsplit("-", 1)[-1])
        lei_code = res.pop("tr-org:hasLEI", None)
        match = self.make_entity(entity, "Company")
        match.id = f"lei-{lei_code}" if lei_code is not None else f"permid-{perm_id}"
        match.add("sourceUrl", url)
        match.add("leiCode", lei_code)
        match.add("permId", perm_id)
        match.add("name", res.pop("vcard:organization-name", None))
        match.add("website", res.pop("hasURL", None))
        match.add("country", self.fetch_placename(res.pop("isDomiciledIn", None)))
        incorporated = self.fetch_placename(res.pop("isIncorporatedIn", None))
        match.add("jurisdiction", incorporated)
        inc_date = res.pop("hasLatestOrganizationFoundedDate", None)
        match.add("incorporationDate", inc_date)

        hq_addr = res.pop("mdaas:HeadquartersAddress", None)
        reg_addr = res.pop("mdaas:RegisteredAddress", None)
        for addr in (hq_addr, reg_addr):
            if addr is not None:
                # Collapse the multi-line address into one comma-separated line.
                addr = ", ".join(addr.split("\n"))
                addr = addr.replace(",,", ",").strip().strip(",")
                match.add("address", addr)
        status_uri = res.pop("hasActivityStatus", None)
        status = STATUS.get(status_uri)
        # Warn only about statuses that are present but unmapped; a record
        # without any status is not an anomaly worth logging.
        if status is None and status_uri is not None:
            log.warning("Unknown status: %s", status_uri)
        match.add("status", status)
        match.add("phone", res.pop("tr-org:hasHeadquartersPhoneNumber", None))
        match.add("phone", res.pop("tr-org:hasRegisteredPhoneNumber", None))
        res.pop("tr-org:hasHeadquartersFaxNumber", None)
        res.pop("tr-org:hasRegisteredFaxNumber", None)

        quote = res.pop("hasOrganizationPrimaryQuote", None)
        if quote is not None:
            quote_res = self.fetch_permid(quote)
            if quote_res is not None:
                match.add("ticker", quote_res.pop("tr-fin:hasExchangeTicker", None))
                match.add("ricCode", quote_res.pop("tr-fin:hasRic", None))
                match.add("topics", "corp.public")
        return match

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield PermID candidates for an organization.

        Known `permId` values on the entity are fetched directly first,
        then the matching API is queried. An `EnrichmentAbort` sets
        `quota_exceeded`, after which every later call returns immediately.
        """
        if self.quota_exceeded:
            return
        if not entity.schema.is_a("Organization"):
            return
        try:
            for permid in entity.get("permId", quiet=True):
                permid_url = f"https://permid.org/1-{permid}"
                match = self.fetch_perm_org(entity, permid_url)
                if match is not None:
                    yield match
            headers = {
                "x-openmatch-numberOfMatchesPerRecord": "4",
                "X-AG-Access-Token": self.api_token,
                "x-openmatch-dataType": "Organization",
            }
            cache_key = f"permid:{entity.id}"
            query = self.entity_to_queries(entity)
            res = self.http_post_json_cached(
                self.MATCHING_API,
                cache_key,
                data=query,
                headers=headers,
                cache_days=self.cache_days,
            )
            # Several query rows can resolve to the same record; emit each once.
            seen_matches: Set[str] = set()
            for result in res.get("outputContentResponse", []):
                match_permid_url = result.get("Match OpenPermID")
                if match_permid_url is None or match_permid_url in seen_matches:
                    continue
                seen_matches.add(match_permid_url)
                match = self.fetch_perm_org(entity, match_permid_url)
                if match is not None:
                    yield match
        except EnrichmentAbort as exc:
            self.quota_exceeded = True
            log.warning("PermID quota exceeded: %s", exc)

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield the match unchanged; no further expansion is performed."""
        yield match