nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,268 @@
1
+ import logging
2
+ from typing import Generator, Optional, Set
3
+ from followthemoney.helpers import check_person_cutoff
4
+ from followthemoney import StatementEntity, registry, DS, SE
5
+ from requests import Session
6
+ from rigour.ids.wikidata import is_qid
7
+ from rigour.territories import get_territory_by_qid
8
+
9
+ from nomenklatura.cache import Cache
10
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
11
+ from nomenklatura.wikidata.client import WikidataClient
12
+ from nomenklatura.wikidata.lang import LangText
13
+ from nomenklatura.wikidata.model import Claim, Item
14
+ from nomenklatura.wikidata.props import (
15
+ PROPS_ASSOCIATION,
16
+ PROPS_DIRECT,
17
+ PROPS_FAMILY,
18
+ PROPS_QUALIFIED,
19
+ PROPS_TOPICS,
20
+ )
21
+ from nomenklatura.wikidata.qualified import qualify_value
22
+ from nomenklatura.wikidata.value import clean_name, is_alias_strong
23
+
24
+ log = logging.getLogger(__name__)
25
+
26
+
27
class WikidataEnricher(Enricher[DS]):
    """Enricher that looks up person entities on Wikidata and expands them
    into a small graph of family/associate relationships.

    Lookups go through a ``WikidataClient`` (which wraps the shared cache);
    traversal depth is controlled by the ``depth`` config option (default 1).
    """

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        # How many hops of family/associate links to follow in item_graph():
        self.depth = self.get_config_int("depth", 1)
        self.client = WikidataClient(cache, self.session, cache_days=self.cache_days)

    def keep_entity(self, entity: StatementEntity) -> bool:
        """Return False for entities filtered out by the person cutoff rule
        (see ``followthemoney.helpers.check_person_cutoff``)."""
        if check_person_cutoff(entity):
            return False
        return True

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield Wikidata-derived candidate proxies for a Person entity.

        If the entity already carries a QID, only that item is fetched;
        otherwise each of its names is run through the ``wbsearchentities``
        API and every search hit is converted to a proxy.
        """
        if not entity.schema.is_a("Person"):
            return

        wikidata_id = self.get_wikidata_id(entity)

        # Already has an ID associated with it:
        if wikidata_id is not None:
            item = self.client.fetch_item(wikidata_id)
            if item is not None:
                proxy = self.item_proxy(entity, item, schema=entity.schema.name)
                if proxy is not None and self.keep_entity(proxy):
                    yield proxy
            # Do not fall through to name search when a QID was present:
            return

        for name in entity.get("name", quiet=True):
            params = {
                "format": "json",
                "search": name,
                "action": "wbsearchentities",
                "language": "en",
                "strictlanguage": "false",
            }
            data = self.http_get_json_cached(WikidataClient.WD_API, params=params)
            if "search" not in data:
                # Evict the bad response so the next run retries the query:
                self.http_remove_cache(WikidataClient.WD_API, params=params)
                log.info("Search response [%s] does not include results" % name)
                continue
            for result in data["search"]:
                item = self.client.fetch_item(result["id"])
                if item is not None:
                    proxy = self.item_proxy(entity, item, schema=entity.schema.name)
                    if proxy is not None and self.keep_entity(proxy):
                        yield proxy

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield the full proxy for a matched item plus its relationship graph."""
        wikidata_id = self.get_wikidata_id(match)
        if wikidata_id is None:
            return
        item = self.client.fetch_item(wikidata_id)
        if item is None:
            return
        proxy = self.item_proxy(match, item, schema=match.schema.name)
        if proxy is None or not self.keep_entity(proxy):
            return
        # Propagate PEP status from the original entity onto the expansion:
        if "role.pep" in entity.get("topics", quiet=True):
            proxy.add("topics", "role.pep")
        yield proxy
        yield from self.item_graph(proxy, item)

    def get_wikidata_id(self, entity: SE) -> Optional[str]:
        """Return the entity's QID, either from its ID or a wikidataId prop."""
        if entity.id is not None and is_qid(entity.id):
            return str(entity.id)
        for value in entity.get("wikidataId", quiet=True):
            if is_qid(value):
                return value
        return None

    def make_link(
        self,
        proxy: SE,
        claim: Claim,
        depth: int,
        seen: Set[str],
        schema: str,
        other_schema: str,
        source_prop: str,
        target_prop: str,
    ) -> Generator[SE, None, None]:
        """Yield the entity at the other end of a relationship claim, its own
        sub-graph (depth - 1), and a link entity (e.g. Family/Associate)
        connecting the two.

        ``seen`` carries already-visited QIDs to break cycles.
        """
        if depth < 1 or claim.qid is None or claim.qid in seen:
            return
        item = self.client.fetch_item(claim.qid)
        if item is None:
            return

        other = self.item_proxy(proxy, item, schema=other_schema)
        if other is None or not self.keep_entity(other):
            return None
        if proxy.id is None or other.id is None:
            return None
        # Hacky: if an entity is a PEP, then by definition their relatives and
        # associates are RCA (relatives and close associates).
        if "role.pep" in proxy.get("topics", quiet=True):
            if "role.pep" not in other.get("topics"):
                other.add("topics", "role.rca")
        yield other
        yield from self.item_graph(other, item, depth=depth - 1, seen=seen)
        link = self.make_entity(proxy, schema)
        # Sort the two IDs so the link ID is stable regardless of direction:
        min_id, max_id = sorted((proxy.id, other.id))
        # FIXME: doesn't lead to collisions because claim.property has an inverse:
        link.id = f"wd-{claim.property}-{min_id}-{max_id}"
        link.id = link.id.lower()
        link.add(source_prop, proxy.id)
        link.add(target_prop, item.id)
        claim.property_label.apply(link, "relationship")

        # P580: start time
        for qual in claim.get_qualifier("P580"):
            qual.text.apply(link, "startDate")

        # P582: end time
        for qual in claim.get_qualifier("P582"):
            qual.text.apply(link, "endDate")

        # P585: point in time
        for qual in claim.get_qualifier("P585"):
            qual.text.apply(link, "date")

        # P1039: kinship to subject
        for qual in claim.get_qualifier("P1039"):
            qual.text.apply(link, "relationship")

        # P2868: subject has role
        for qual in claim.get_qualifier("P2868"):
            qual.text.apply(link, "relationship")

        # P854: reference URL — recorded as the link's source:
        for ref in claim.references:
            for snak in ref.get("P854"):
                snak.text.apply(link, "sourceUrl")
        yield link

    def item_graph(
        self,
        proxy: SE,
        item: Item,
        depth: Optional[int] = None,
        seen: Optional[Set[str]] = None,
    ) -> Generator[SE, None, None]:
        """Walk the item's family and association claims, yielding related
        entities and link entities up to ``depth`` hops away."""
        if seen is None:
            seen = set()
        # union() (not add) keeps caller-supplied sets unmutated:
        seen = seen.union([item.id])
        if depth is None:
            depth = self.depth
        for claim in item.claims:
            # TODO: memberships, employers?
            if claim.property in PROPS_FAMILY:
                yield from self.make_link(
                    proxy,
                    claim,
                    depth,
                    seen,
                    schema="Family",
                    other_schema="Person",
                    source_prop="person",
                    target_prop="relative",
                )
                continue
            if claim.property in PROPS_ASSOCIATION:
                yield from self.make_link(
                    proxy,
                    claim,
                    depth,
                    seen,
                    schema="Associate",
                    other_schema="Person",
                    source_prop="person",
                    target_prop="associate",
                )
                continue

    def item_proxy(self, ref: SE, item: Item, schema: str = "Person") -> Optional[SE]:
        """Convert a Wikidata item into an entity proxy of the given schema.

        Returns None when the item has no modification timestamp, or when a
        Person-schema proxy is not an instance of Q5 (human).
        """
        proxy = self.make_entity(ref, schema)
        proxy.id = item.id
        if item.modified is None:
            return None
        # proxy.add("modifiedAt", item.modified)
        proxy.add("wikidataId", item.id)
        # Lower-cased full names, used below to validate aliases/name parts:
        names: Set[str] = set()
        for label in item.labels:
            label.apply(proxy, "name", clean=clean_name)
            if label.text is not None:
                names.add(label.text.lower())
        if item.description is not None:
            item.description.apply(proxy, "notes")
        for alias in item.aliases:
            if alias.text is None or alias.text.lower() in names:
                continue
            _strong = is_alias_strong(alias.text, names)
            prop = "alias" if _strong else "weakAlias"
            alias.apply(proxy, prop, clean=clean_name)
            if _strong:
                names.add(alias.text.lower())

        if proxy.schema.is_a("Person") and not item.is_instance("Q5"):
            log.debug("Person is not a Q5 [%s]: %s", item.id, item.labels)
            return None

        names_concat = " ".join(names)
        for claim in item.claims:
            if claim.property is None:
                continue
            ftm_prop = PROPS_DIRECT.get(claim.property)
            if ftm_prop is None:
                continue
            if ftm_prop not in proxy.schema.properties:
                log.info("Entity %s does not have property: %s", proxy.id, ftm_prop)
                continue
            ftm_prop_ = proxy.schema.get(ftm_prop)
            if ftm_prop_ is None:
                log.info("Entity %s does not have property: %s", proxy.id, ftm_prop)
                continue
            # Country-typed props: resolve the claim QID to an FtM country code:
            if ftm_prop_.type == registry.country:
                territory = get_territory_by_qid(claim.qid)
                if territory is None or territory.ftm_country is None:
                    continue
                value = LangText(territory.ftm_country, original=claim.qid)
            else:
                value = claim.text

            # Sanity check that the name parts are in any of the full names:
            if ftm_prop in ("firstName", "lastName", "fatherName"):
                if value.text is None or value.text.lower() not in names_concat:
                    continue

            # Make sure the aliases look like the main name, otherwise mark them as weak:
            if ftm_prop == "alias":
                if value.text is None or value.text.lower() in names:
                    continue
                _strong = is_alias_strong(value.text, names)
                ftm_prop = "alias" if _strong else "weakAlias"

            if ftm_prop in PROPS_QUALIFIED:
                value = qualify_value(value, claim)
            if ftm_prop == "topics":
                topic = PROPS_TOPICS.get(claim.qid or "")
                if topic is None:
                    continue
                value = LangText(topic, original=claim.qid)
            value.apply(proxy, ftm_prop)
        return proxy
@@ -0,0 +1,116 @@
1
+ import os
2
+ import time
3
+ import logging
4
+ from banal import ensure_list
5
+ from typing import Any, Generator, Optional, Dict, List
6
+ from urllib.parse import urljoin
7
+ from followthemoney import registry, DS, SE
8
+ from followthemoney import StatementEntity
9
+ from followthemoney.namespace import Namespace
10
+ from requests import Session
11
+ from rigour.urls import build_url
12
+
13
+ from nomenklatura.cache import Cache
14
+ from nomenklatura.enrich.common import Enricher, EnricherConfig
15
+ from nomenklatura.enrich.common import EnrichmentException
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+
20
class YenteEnricher(Enricher[DS]):
    """Uses the `yente` match API to look up entities in a specific dataset.

    Configuration keys: ``api`` (base URL, required), ``dataset``, ``cutoff``,
    ``algorithm``, ``expand_nested``, ``fuzzy``, ``strip_namespace`` and
    ``api_key`` (falls back to the ``YENTE_API_KEY`` environment variable).
    """

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        self._api: str = config.pop("api")
        self._yente_dataset: str = config.pop("dataset", "default")
        self._cutoff: Optional[float] = config.pop("cutoff", None)
        # Fixed: the scoring algorithm is a name like "best", not a float.
        self._algorithm: Optional[str] = config.pop("algorithm", "best")
        self._nested: bool = config.pop("expand_nested", True)
        self._fuzzy: bool = config.pop("fuzzy", False)
        self._ns: Optional[Namespace] = None
        if self.get_config_bool("strip_namespace"):
            self._ns = Namespace()

        # Environment variables inside the configured key are expanded; an
        # empty result falls back to the YENTE_API_KEY environment variable.
        # (expandvars(...).strip() always returns a str, never None, so a
        # simple truthiness check suffices here.)
        api_key: Optional[str] = os.path.expandvars(config.pop("api_key", "")).strip()
        if not api_key:
            api_key = os.environ.get("YENTE_API_KEY")
        self._api_key: Optional[str] = api_key
        if self._api_key is not None:
            self.session.headers["Authorization"] = f"ApiKey {self._api_key}"

    def make_url(self, entity: StatementEntity) -> str:
        """Build the yente detail URL for the given entity.

        NOTE(review): urljoin drops the last path segment of ``api`` when it
        has no trailing slash — confirm configured APIs end with "/".
        """
        return urljoin(self._api, f"entities/{entity.id}")

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Query the yente ``/match`` endpoint and yield candidate proxies.

        Retries up to four times with quadratic back-off on enrichment
        errors; the final failure is re-raised.
        """
        if not entity.schema.matchable:
            return
        url = urljoin(self._api, f"match/{self._yente_dataset}")
        params: Dict[str, Any] = {"fuzzy": self._fuzzy, "algorithm": self._algorithm}
        if self._cutoff is not None:
            params["cutoff"] = self._cutoff
        url = build_url(url, params)
        cache_key = f"{url}:{entity.id}"
        # Only send matchable, non-entity-typed properties in the query:
        props: Dict[str, List[str]] = {}
        for prop in entity.iterprops():
            if prop.type == registry.entity:
                continue
            if prop.matchable:
                props[prop.name] = entity.get(prop)
        query = {
            "queries": {
                "entity": {
                    "schema": entity.schema.name,
                    "properties": props,
                }
            }
        }
        for retry in range(4):
            try:
                response = self.http_post_json_cached(url, cache_key, query)
                inner_resp = response.get("responses", {}).get("entity", {})
                for result in inner_resp.get("results", []):
                    proxy = self.load_entity(entity, result)
                    proxy.add("sourceUrl", self.make_url(proxy))
                    if self._ns is not None:
                        proxy = self._ns.apply(proxy)
                    yield proxy
                return
            except EnrichmentException as exc:
                log.info("Error matching %r: %s", entity, exc)
                if retry == 3:
                    raise
                time.sleep((retry + 1) ** 2)

    def _traverse_nested(self, entity: SE, response: Any) -> Generator[SE, None, None]:
        """Recursively yield the entity and all nested entities embedded in
        the API response, back-filling reverse property references."""
        entity = self.load_entity(entity, response)
        if self._ns is not None:
            entity = self._ns.apply(entity)
        yield entity
        for prop_name, values in response.get("properties", {}).items():
            prop = entity.schema.properties.get(prop_name)
            if prop is None or prop.type != registry.entity:
                continue
            for value in ensure_list(values):
                if isinstance(value, dict):
                    # Link the nested entity back to its parent so the graph
                    # stays navigable in both directions:
                    if prop.reverse is not None and not prop.reverse.stub:
                        reverse = prop.reverse.name
                        if reverse not in value["properties"]:
                            value["properties"][reverse] = []
                        value["properties"][reverse].append(entity.id)
                    yield from self._traverse_nested(entity, value)

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Fetch the matched entity (preferring its recorded sourceUrl) and
        yield it together with its nested sub-entities."""
        url = self.make_url(match)
        for source_url in match.get("sourceUrl", quiet=True):
            if source_url.startswith(self._api):
                url = source_url
        url = build_url(url, {"nested": self._nested})
        response = self.http_get_json_cached(url)
        yield from self._traverse_nested(match, response)
@@ -0,0 +1,9 @@
1
+ from followthemoney.exc import FollowTheMoneyException
2
+
3
+
4
class NomenklaturaException(FollowTheMoneyException):
    """Root of the nomenklatura exception hierarchy, itself derived from the
    followthemoney base exception so callers can catch either."""

    pass
6
+
7
+
8
class MetadataException(NomenklaturaException):
    """Raised for invalid or inconsistent dataset metadata."""

    pass
@@ -0,0 +1,5 @@
1
+ from nomenklatura.index.index import Index
2
+ from nomenklatura.index.common import BaseIndex
3
+
4
+
5
+ __all__ = ["BaseIndex", "Index"]
@@ -0,0 +1,24 @@
1
+ from pathlib import Path
2
+ from typing import Generic, Iterable, List, Tuple
3
+ from followthemoney import DS, SE
4
+ from nomenklatura.resolver import Identifier
5
+ from nomenklatura.store import View
6
+
7
+
8
class BaseIndex(Generic[DS, SE]):
    """Abstract interface for blocking indexes used in cross-referencing.

    Concrete implementations index the entities of a store view and produce
    scored candidate pairs or per-entity match lists.
    """

    # Default ceiling on the number of candidate pairs returned by pairs():
    MAX_PAIRS = 10_000
    # Short identifier for the implementation (e.g. "memory"):
    name: str

    def __init__(self, view: View[DS, SE], data_dir: Path) -> None:
        raise NotImplementedError

    def build(self) -> None:
        """Index all entities from the view."""
        raise NotImplementedError

    def pairs(
        self, max_pairs: int = MAX_PAIRS
    ) -> Iterable[Tuple[Tuple[Identifier, Identifier], float]]:
        """Return up to ``max_pairs`` scored candidate entity-ID pairs."""
        raise NotImplementedError

    def match(self, entity: SE) -> List[Tuple[Identifier, float]]:
        """Return scored candidate entity IDs for a single entity."""
        raise NotImplementedError
@@ -0,0 +1,89 @@
1
+ from typing import Any, Dict, Generator, Tuple
2
+
3
+ from nomenklatura.resolver import Identifier
4
+
5
+
6
class Entry(object):
    """A set of entities and a weight associated with a given term in the index.

    Tracks, per entity, how many times the entry's token occurred in that
    entity's values.
    """

    # NOTE: "idf" is declared as a slot but never assigned by this class.
    __slots__ = "idf", "entities"

    def __init__(self) -> None:
        # Per-entity occurrence count of this token:
        self.entities: Dict[Identifier, int] = dict()

    def add(self, entity_id: Identifier) -> None:
        """Mark the given entity as relevant to the entry's token."""
        self.entities[entity_id] = self.entities.get(entity_id, 0) + 1

    def frequencies(
        self, field: "Field"
    ) -> Generator[Tuple[Identifier, float], None, None]:
        """
        Term Frequency (TF) for each entity in this entry.

        TF being the number of occurrences of this token in the entity divided
        by the total number of tokens in the entity (scoped to this field).
        """
        for entity_id, mentions in self.entities.items():
            tokens_in_entity = field.entities[entity_id]
            # Guard against division by zero for degenerate entries:
            divisor = max(1, tokens_in_entity)
            yield entity_id, mentions / divisor

    def __repr__(self) -> str:
        return f"<Entry({len(self.entities)!r})>"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for pickling; the inverse of from_dict()."""
        return {"entities": self.entities}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Entry":
        """Restore an entry from its to_dict() representation."""
        entry = cls()
        entry.entities = data["entities"]
        return entry
46
+
47
+
48
class Field(object):
    """Index of all tokens of the same type.

    Maps each token to an Entry and keeps a per-entity count of how many
    tokens that entity contributed to this field.
    """

    __slots__ = "len", "avg_len", "tokens", "entities"

    def __init__(self) -> None:
        # Number of entities in this field (computed by compute(), min 1):
        self.len = 0
        # Mean token count per entity (computed by compute()):
        self.avg_len = 0.0
        self.tokens: Dict[str, Entry] = {}
        self.entities: Dict[Identifier, int] = {}

    def add(self, entity_id: Identifier, token: str) -> None:
        """Record one occurrence of ``token`` for ``entity_id``."""
        entry = self.tokens.get(token)
        if entry is None:
            entry = Entry()
            self.tokens[token] = entry
        entry.add(entity_id)
        self.entities[entity_id] = self.entities.get(entity_id, 0) + 1

    def compute(self) -> None:
        """Recompute the cached length statistics after indexing."""
        total_tokens = sum(self.entities.values())
        self.len = max(1, len(self.entities))
        self.avg_len = total_tokens / self.len

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for pickling; the inverse of from_dict()."""
        token_state = {token: entry.to_dict() for token, entry in self.tokens.items()}
        entity_state = {ident.id: count for ident, count in self.entities.items()}
        return {"tokens": token_state, "entities": entity_state}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Field":
        """Restore a field from its to_dict() representation."""
        field = cls()
        field.tokens = {
            token: Entry.from_dict(state) for token, state in data["tokens"].items()
        }
        # Entity keys were flattened to plain ID strings for serialization;
        # rehydrate them into Identifier instances:
        entity_state: Dict[str, int] = data.get("entities", {})
        field.entities = {
            Identifier.get(raw): count for raw, count in entity_state.items()
        }
        return field

    def __repr__(self) -> str:
        return "<Field(%d, %.3f)>" % (self.len, self.avg_len)
@@ -0,0 +1,170 @@
1
+ from pathlib import Path
2
+ import pickle
3
+ import logging
4
+ from itertools import combinations
5
+ from typing import Any, Dict, List, Set, Tuple
6
+ from followthemoney import registry, DS, SE
7
+ from followthemoney.util import PathLike
8
+
9
+ from nomenklatura.resolver import Pair, Identifier
10
+ from nomenklatura.store import View
11
+ from nomenklatura.index.entry import Field
12
+ from nomenklatura.index.tokenizer import NAME_PART_FIELD, WORD_FIELD, Tokenizer
13
+ from nomenklatura.index.common import BaseIndex
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+
18
class Index(BaseIndex[DS, SE]):
    """
    An in-memory search index to match entities against a given dataset.

    For each field in the dataset, the index stores the IDs which contains each
    token, along with the absolute frequency of each token in the document.
    """

    name = "memory"

    # Per-field score multipliers applied in pairs() and match(); fields not
    # listed here default to 1.0.
    BOOSTS = {
        NAME_PART_FIELD: 2.0,
        WORD_FIELD: 0.5,
        registry.name.name: 10.0,
        # registry.country.name: 1.5,
        # registry.date.name: 1.5,
        # registry.language: 0.7,
        # registry.iban.name: 3.0,
        registry.phone.name: 3.0,
        registry.email.name: 3.0,
        # registry.entity: 0.0,
        # registry.topic: 2.1,
        registry.address.name: 2.5,
        registry.identifier.name: 3.0,
    }

    __slots__ = "view", "fields", "tokenizer", "entities"

    def __init__(self, view: View[DS, SE], data_dir: Path):
        # NOTE(review): data_dir is accepted for interface compatibility with
        # BaseIndex but is not used by this in-memory implementation.
        self.view = view
        self.tokenizer = Tokenizer[DS, SE]()
        self.fields: Dict[str, Field] = {}
        self.entities: Set[Identifier] = set()

    def index(self, entity: SE) -> None:
        """Index one entity. This is not idempotent, you need to remove the
        entity before re-indexing it."""
        if not entity.schema.matchable or entity.id is None:
            return
        ident = Identifier.get(entity.id)
        for field, token in self.tokenizer.entity(entity):
            if field not in self.fields:
                self.fields[field] = Field()
            self.fields[field].add(ident, token)
        self.entities.add(ident)

    def build(self) -> None:
        """Index all entities in the dataset."""
        log.info("Building index from: %r...", self.view)
        self.fields = {}
        self.entities = set()
        for entity in self.view.entities():
            self.index(entity)
        self.commit()
        log.info("Built index: %r", self)

    def commit(self) -> None:
        """Recompute each field's cached length statistics after indexing."""
        for field in self.fields.values():
            field.compute()

    def pairs(self, max_pairs: int = BaseIndex.MAX_PAIRS) -> List[Tuple[Pair, float]]:
        """
        A second method of doing xref: summing up the pairwise match value
        for all entities linearly. This uses a lot of memory but is really
        fast.

        The score of each pair is the the sum of the product of term frequencies for
        each co-occurring token in each field of the pair.

        We skip any tokens with more than 100 entities.
        """
        pairs: Dict[Pair, float] = {}
        log.info("Building index blocking pairs...")
        for field_name, field in self.fields.items():
            boost = self.BOOSTS.get(field_name, 1.0)
            for idx, entry in enumerate(field.tokens.values()):
                if idx % 10000 == 0:
                    log.info("Pairwise xref [%s]: %d" % (field_name, idx))

                # Single-entity tokens produce no pairs; very common tokens
                # (>100 entities) would explode combinatorially — skip both:
                if len(entry.entities) == 1 or len(entry.entities) > 100:
                    continue
                entities = entry.frequencies(field)
                for (left, lw), (right, rw) in combinations(entities, 2):
                    if lw == 0.0 or rw == 0.0:
                        continue
                    # Canonical ordering so (a, b) and (b, a) share one slot:
                    pair = (max(left, right), min(left, right))
                    if pair not in pairs:
                        pairs[pair] = 0
                    score = (lw + rw) * boost
                    pairs[pair] += score

        return sorted(pairs.items(), key=lambda p: p[1], reverse=True)[:max_pairs]

    def match(self, entity: SE) -> List[Tuple[Identifier, float]]:
        """Match an entity against the index, returning a list of
        (entity_id, score) pairs."""
        scores: Dict[Identifier, float] = {}
        for field_name, token in self.tokenizer.entity(entity):
            field = self.fields.get(field_name)
            if field is None:
                continue
            entry = field.tokens.get(token)
            if entry is None:
                continue
            for ident, weight in entry.frequencies(field):
                if ident not in scores:
                    scores[ident] = 0.0
                scores[ident] += weight * self.BOOSTS.get(field_name, 1.0)
        return sorted(scores.items(), key=lambda s: s[1], reverse=True)

    def save(self, path: PathLike) -> None:
        """Pickle the index state to ``path``."""
        with open(path, "wb") as fh:
            pickle.dump(self.to_dict(), fh)

    @classmethod
    def load(cls, view: View[DS, SE], path: Path, data_dir: Path) -> "Index[DS, SE]":
        """Load a pickled index from ``path``; if the file does not exist,
        build a fresh index from the view and save it there instead."""
        index = Index(view, data_dir)
        if not path.exists():
            log.debug("Cannot load: %r", index)
            index.build()
            index.save(path)
            return index

        with open(path, "rb") as fh:
            # NOTE(review): pickle.load on a file from disk — only load
            # index files this process (or a trusted one) has written.
            state = pickle.load(fh)
        index.from_dict(state)
        index.commit()
        log.debug("Loaded: %r", index)
        return index

    def to_dict(self) -> Dict[str, Any]:
        """Prepare an index for pickling."""
        return {
            "fields": {n: f.to_dict() for n, f in self.fields.items()},
            "entities": [e.id for e in self.entities],
        }

    def from_dict(self, state: Dict[str, Any]) -> None:
        """Restore a pickled index."""
        fields = state["fields"].items()
        self.fields = {t: Field.from_dict(i) for t, i in fields}
        entities: List[str] = state.get("entities", [])
        self.entities = set((Identifier.get(e) for e in entities))

    def __len__(self) -> int:
        return len(self.entities)

    def __repr__(self) -> str:
        return "<Index(%r, %d, %d)>" % (
            self.view.scope.name,
            len(self.fields),
            len(self.entities),
        )