nomenklatura-mpt 4.1.9 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/__init__.py
ADDED
nomenklatura/cache.py
ADDED
@@ -0,0 +1,194 @@
import math
import json
import logging
from random import randint
from dataclasses import dataclass
from typing import Any, cast, Dict, Optional, Union, Generator
from datetime import datetime, timedelta
from sqlalchemy import MetaData
from sqlalchemy import Table, Column, DateTime, Unicode
from sqlalchemy.engine import Engine, Connection, Transaction
from sqlalchemy.future import select
from sqlalchemy.sql.expression import delete
from sqlalchemy.exc import OperationalError, InvalidRequestError
from sqlalchemy.dialects.postgresql import insert as upsert
from rigour.time import naive_now
from followthemoney import Dataset

from nomenklatura.db import get_engine, get_metadata


log = logging.getLogger(__name__)
Value = Union[str, None]


@dataclass
class CacheValue:
    key: str
    dataset: Optional[str]
    text: Value
    timestamp: datetime


def randomize_cache(days: int) -> timedelta:
    min_cache = max(1, math.ceil(days * 0.5))
    max_cache = math.ceil(days * 1.3)
    return timedelta(days=randint(min_cache, max_cache))


class Cache(object):
    def __init__(
        self, engine: Engine, metadata: MetaData, dataset: Dataset, create: bool = False
    ) -> None:
        self.dataset = dataset
        self._engine = engine
        self._conn: Optional[Connection] = None
        self._transaction: Optional[Transaction] = None
        self._table = Table(
            "cache",
            metadata,
            Column("key", Unicode(), primary_key=True),
            Column("text", Unicode(), nullable=True),
            Column("dataset", Unicode(), nullable=False),
            Column("timestamp", DateTime, index=True),
            extend_existing=True,
        )
        if create:
            metadata.create_all(bind=engine, checkfirst=True, tables=[self._table])

        self._preload: Dict[str, CacheValue] = {}

    @property
    def conn(self) -> Connection:
        if self._conn is None:
            self._conn = self._engine.connect()
            self._transaction = self._conn.begin()
        return self._conn

    def set(self, key: str, value: Value) -> None:
        self._preload.pop(key, None)
        cache = {
            "timestamp": naive_now(),
            "key": key,
            "dataset": self.dataset.name,
            "text": value,
        }
        try:
            istmt = upsert(self._table).values(cache)
            values = dict(
                timestamp=istmt.excluded.timestamp,
                text=istmt.excluded.text,
                dataset=istmt.excluded.dataset,
            )
            stmt = istmt.on_conflict_do_update(index_elements=["key"], set_=values)
            self.conn.execute(stmt)
        except (OperationalError, InvalidRequestError) as exc:
            log.exception("Error while saving to cache: %s" % exc)
            self.reset()

    def set_json(self, key: str, value: Any) -> None:
        return self.set(key, json.dumps(value))

    def get(self, key: str, max_age: Optional[int] = None) -> Optional[Value]:
        if max_age is not None and max_age < 1:
            return None

        cache_cutoff = None
        if max_age is not None:
            cache_cutoff = naive_now() - randomize_cache(max_age)

        cache = self._preload.get(key)
        if cache is not None:
            if cache_cutoff is not None and cache.timestamp < cache_cutoff:
                return None
            return cache.text

        q = select(self._table.c.text)
        q = q.filter(self._table.c.key == key)
        if cache_cutoff is not None:
            q = q.filter(self._table.c.timestamp > cache_cutoff)
        q = q.order_by(self._table.c.timestamp.desc())
        q = q.limit(1)
        try:
            result = self.conn.execute(q)
            row = result.fetchone()
        except InvalidRequestError as ire:
            log.exception("Cache fetch error: %s", ire)
            self.reset()
            return None
        if row is not None:
            return cast(Optional[str], row.text)
        return None

    def get_json(self, key: str, max_age: Optional[int] = None) -> Optional[Any]:
        text = self.get(key, max_age=max_age)
        if text is None:
            return None
        return json.loads(text)

    def has(self, key: str) -> bool:
        return self.get(key) is not None

    def delete(self, key: str) -> None:
        self._preload.pop(key, None)
        pq = delete(self._table)
        pq = pq.where(self._table.c.key == key)
        try:
            self.conn.execute(pq)
        except InvalidRequestError as ire:
            log.exception("Cache delete error: %s", ire)
            self.reset()
        return None

    def all(self, like: Optional[str]) -> Generator[CacheValue, None, None]:
        q = select(self._table)
        if like is not None:
            q = q.filter(self._table.c.key.like(like))

        result = self.conn.execute(q)
        for row in result.yield_per(10000):
            yield CacheValue(row.key, row.dataset, row.text, row.timestamp)

    def preload(self, like: Optional[str] = None) -> None:
        log.info("Pre-loading cache: %r", like)
        for cache in self.all(like=like):
            self._preload[cache.key] = cache

    def clear(self) -> None:
        try:
            pq = delete(self._table)
            pq = pq.where(self._table.c.dataset == self.dataset.name)
            self.conn.execute(pq)
        except InvalidRequestError:
            log.exception("Cannot clear cache from database")
            self.reset()

    def reset(self) -> None:
        if self._conn is not None:
            self._conn.close()
        self._conn = None
        self._transaction = None

    def flush(self) -> None:
        # log.info("Flushing cache.")
        if self._transaction is not None:
            try:
                self._transaction.commit()
            except InvalidRequestError:
                log.exception("Transaction was failed, cannot store cache state.")
            self.reset()

    def close(self) -> None:
        self.flush()

    def __repr__(self) -> str:
        return f"<Cache({self._table!r})>"

    def __hash__(self) -> int:
        return hash((self.dataset.name, self._table.name))

    @classmethod
    def make_default(cls, dataset: Dataset) -> "Cache":
        engine = get_engine()
        metadata = get_metadata()
        return cls(engine, metadata, dataset, create=True)
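The Cache class above stores string payloads keyed by name, scoped to a dataset, with a randomized expiry window. A minimal usage sketch, not part of the package: the dataset metadata, cache key and payload below are illustrative, and a database reachable via nomenklatura.settings.DB_URL is assumed.

from followthemoney import Dataset
from nomenklatura.cache import Cache

# Hypothetical dataset for illustration; any Dataset instance works.
dataset = Dataset.make({"name": "demo", "title": "Demo dataset"})
cache = Cache.make_default(dataset)

# set_json/get_json wrap set/get with json.dumps/json.loads.
cache.set_json("api:acme-inc", {"status": "ok", "hits": 3})
cached = cache.get_json("api:acme-inc", max_age=30)  # None once older than roughly 30 days
print(cached)

cache.flush()  # commit the open transaction
cache.close()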
nomenklatura/cli.py
ADDED
@@ -0,0 +1,260 @@
import os
import shutil
import yaml
import click
import logging
from pathlib import Path
from typing import Generator, Optional, Tuple
from followthemoney import Dataset, ValueEntity, StatementEntity as Entity
from followthemoney.statement import Statement, CSV, FORMATS
from followthemoney.statement import write_statements, read_path_statements
from followthemoney.cli.util import path_writer, InPath, OutPath
from followthemoney.cli.util import path_entities, write_entity
from followthemoney.cli.aggregate import sorted_aggregate

from nomenklatura.cache import Cache
from nomenklatura.matching import train_v1_matcher, train_erun_matcher
from nomenklatura.store import load_entity_file_store
from nomenklatura.resolver import Resolver, Linker
from nomenklatura.enrich import Enricher, make_enricher, match, enrich
from nomenklatura.matching import get_algorithm, DefaultAlgorithm
from nomenklatura.xref import xref as run_xref
from nomenklatura.tui import dedupe_ui
from nomenklatura.matching.bench import bench_matcher

INDEX_SEGMENT = "xref-index"

log = logging.getLogger(__name__)

ResPath = click.Path(dir_okay=False, writable=True, path_type=Path)


def _load_enricher(path: Path) -> Tuple[Dataset, Enricher[Dataset]]:
    with open(path, "r") as fh:
        data = yaml.safe_load(fh)
        dataset = Dataset.make(data)
        cache = Cache.make_default(dataset)
        enricher = make_enricher(dataset, cache, data)
        if enricher is None:
            raise TypeError("Could not load enricher")
        return dataset, enricher


def _get_linker() -> Linker[Entity]:
    resolver = Resolver[Entity].make_default()
    linker = resolver.get_linker()
    resolver.close()
    return linker


@click.group(help="Nomenklatura data integration")
def cli() -> None:
    logging.basicConfig(level=logging.INFO)


@cli.command("xref", help="Generate dedupe candidates")
@click.argument("path", type=InPath)
@click.option("-a", "--auto-threshold", type=click.FLOAT, default=None)
@click.option("-l", "--limit", type=click.INT, default=5000)
@click.option("--algorithm", default=DefaultAlgorithm.NAME)
@click.option("--scored/--unscored", is_flag=True, type=click.BOOL, default=True)
@click.option(
    "-c",
    "--clear",
    is_flag=True,
    default=False,
    help="Clear the index directory, if it exists.",
)
def xref_file(
    path: Path,
    auto_threshold: Optional[float] = None,
    algorithm: str = DefaultAlgorithm.NAME,
    limit: int = 5000,
    scored: bool = True,
    clear: bool = False,
) -> None:
    resolver = Resolver[Entity].make_default()
    resolver.begin()
    store = load_entity_file_store(path, resolver=resolver)
    algorithm_type = get_algorithm(algorithm)
    if algorithm_type is None:
        raise click.Abort(f"Unknown algorithm: {algorithm}")

    index_dir = Path(
        os.environ.get("NOMENKLATURA_INDEX_PATH", path.parent / INDEX_SEGMENT)
    )
    if clear and index_dir.exists():
        log.info("Clearing index: %s", index_dir)
        shutil.rmtree(index_dir, ignore_errors=True)
    run_xref(
        resolver,
        store,
        index_dir,
        auto_threshold=auto_threshold,
        algorithm=algorithm_type,
        scored=scored,
        limit=limit,
    )
    resolver.commit()
    log.info("Xref complete in: %r", resolver)


@cli.command("prune", help="Remove dedupe candidates")
def xref_prune() -> None:
    resolver = Resolver[Entity].make_default()
    resolver.begin()
    resolver.prune()
    resolver.commit()


@cli.command("apply", help="Apply resolver to an entity stream")
@click.argument("path", type=InPath)
@click.option("-o", "--outpath", type=OutPath, default="-")
@click.option(
    "-d",
    "--dataset",
    type=str,
    default=None,
    help="Add a dataset to the entity metadata",
)
def apply(path: Path, outpath: Path, dataset: Optional[str] = None) -> None:
    linker = _get_linker()
    with path_writer(outpath) as outfh:
        for proxy in path_entities(path, ValueEntity):
            proxy = linker.apply_stream(proxy)
            if dataset is not None:
                proxy.datasets.add(dataset)
            write_entity(outfh, proxy)


@cli.command("sorted-aggregate", help="Merge sort-order entities")
@click.option("-i", "--infile", type=InPath, default="-")
@click.option("-o", "--outfile", type=OutPath, default="-")
def sorted_aggregate_(infile: Path, outfile: Path) -> None:
    sorted_aggregate(infile, outfile, ValueEntity)


@cli.command("make-sortable", help="Convert entities into plain-text sortable form")
@click.argument("path", type=InPath)
@click.option("-o", "--outpath", type=OutPath, default="-")
def make_sortable(path: Path, outpath: Path) -> None:
    with path_writer(outpath) as outfh:
        for entity in path_entities(path, Entity):
            write_entity(outfh, entity)


@cli.command("dedupe", help="Interactively judge xref candidates")
@click.argument("path", type=InPath)
@click.option("-x", "--xref", is_flag=True, default=False)
def dedupe(path: Path, xref: bool = False) -> None:
    resolver = Resolver[Entity].make_default()
    resolver.begin()
    store = load_entity_file_store(path, resolver=resolver)
    if xref:
        index_dir = path.parent / INDEX_SEGMENT
        run_xref(resolver, store, index_dir)
    resolver.commit()

    dedupe_ui(resolver, store)


@cli.command("train-v1-matcher", help="Train a matching model from judgement pairs")
@click.argument("pairs_file", type=InPath)
def train_v1_matcher_(pairs_file: Path) -> None:
    train_v1_matcher(pairs_file)


@cli.command("train-erun-matcher", help="Train an ER model from judgement pairs")
@click.argument("pairs_file", type=InPath)
def train_erun_matcher_(pairs_file: Path) -> None:
    train_erun_matcher(pairs_file)


@cli.command("match", help="Generate matches from an enrichment source")
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
@click.option("-o", "--outpath", type=OutPath, default="-")
def match_command(
    config: Path,
    entities: Path,
    outpath: Path,
) -> None:
    resolver = Resolver[Entity].make_default()
    _, enricher = _load_enricher(config)

    try:
        resolver.begin()
        with path_writer(outpath) as fh:
            stream = path_entities(entities, Entity)
            for proxy in match(enricher, resolver, stream):
                write_entity(fh, proxy)
        resolver.commit()
    finally:
        enricher.close()


@cli.command("enrich", help="Fetch extra info from an enrichment source")
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
@click.option("-o", "--outpath", type=OutPath, default="-")  # noqa
def enrich_command(
    config: Path,
    entities: Path,
    outpath: Path,
) -> None:
    resolver = Resolver[Entity].make_default()
    _, enricher = _load_enricher(config)
    try:
        resolver.begin()
        with path_writer(outpath) as fh:
            stream = path_entities(entities, Entity)
            for proxy in enrich(enricher, resolver, stream):
                write_entity(fh, proxy)
        resolver.commit()
    finally:
        enricher.close()


@cli.command("apply-statements", help="Apply a resolver file to a set of statements")
@click.option("-i", "--infile", type=InPath, default="-")
@click.option("-o", "--outpath", type=OutPath, default="-")
@click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
def statements_apply(infile: Path, outpath: Path, format: str) -> None:
    linker = _get_linker()

    def _generate() -> Generator[Statement, None, None]:
        for stmt in read_path_statements(infile, format=format):
            yield linker.apply_statement(stmt)

    with path_writer(outpath) as outfh:
        write_statements(outfh, format, _generate())


@cli.command("load-resolver", help="Load resolver edges from file into database")
@click.argument("source", type=InPath)
def load_resolver(source: Path) -> None:
    resolver = Resolver[Entity].make_default()
    resolver.begin()
    resolver.load(source)
    resolver.commit()


@cli.command("dump-resolver", help="Dump resolver decisions from database to file")
@click.argument("target", type=OutPath)
def dump_resolver(target: Path) -> None:
    resolver = Resolver[Entity].make_default()
    resolver.begin()
    resolver.dump(target)
    resolver.rollback()


@cli.command("bench", help="Benchmark a matching algorithm")
@click.argument("name", type=str)
@click.argument("pairs_file", type=InPath)
@click.option("-n", "--number", type=int, default=1000)
def bench(name: str, pairs_file: Path, number: int = 1000) -> None:
    bench_matcher(name, pairs_file, number)


if __name__ == "__main__":
    cli()
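The click group above exposes the xref, apply, dedupe, enrich and related commands. A hedged sketch of driving it programmatically with click's test runner, not part of the package; the file paths are placeholders and a configured database is assumed.

from click.testing import CliRunner
from nomenklatura.cli import cli

runner = CliRunner()

# Generate dedupe candidates for a local FtM entities file (path is illustrative).
result = runner.invoke(cli, ["xref", "entities.ftm.json", "--limit", "500"])
print(result.exit_code, result.output)

# Apply resolver decisions to the same stream, writing canonicalised entities.
result = runner.invoke(cli, ["apply", "entities.ftm.json", "-o", "resolved.ftm.json"])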
nomenklatura/conflicting_match.py
ADDED
@@ -0,0 +1,80 @@
from typing import Dict, Set, Tuple, Generic, Generator
from itertools import combinations
from collections import defaultdict
from rich.console import Console
from rich.table import Table
from rich import box

from followthemoney import DS, Statement, SE
from nomenklatura.store import View
from nomenklatura.judgement import Judgement
from nomenklatura.resolver import Resolver


class ConflictingMatchReporter(Generic[SE]):
    def __init__(self, view: View[DS, SE], resolver: Resolver[SE], threshold: float):
        self.console = Console()
        self.view = view
        self.resolver = resolver
        self.threshold = threshold
        self.matches: Dict[str, Set[str]] = defaultdict(set)

    def check_match(self, score: float, left_id: str, right_id: str) -> None:
        if score > self.threshold:
            self.matches[left_id].add(right_id)
            self.matches[right_id].add(left_id)

    def get_conflicting_matches(self) -> Generator[Tuple[str, str, str], None, None]:
        for candidate_id, matches in self.matches.items():
            for left_id, right_id in combinations(matches, 2):
                judgement = self.resolver.get_judgement(left_id, right_id)
                if judgement == Judgement.NEGATIVE:
                    yield candidate_id, left_id, right_id

    @staticmethod
    def _sort_key(stmt: Statement) -> Tuple[str, str, int, str, int]:
        prop_order = 0 if stmt.prop == "name" else 1
        lang_order = 0 if stmt.lang is None else 1
        return (stmt.dataset, stmt.entity_id, prop_order, stmt.value, lang_order)

    def report_conflicting_match(self, title: str, entity: SE) -> None:
        if entity.id is None:
            return
        statements = []
        for stmt in entity.statements:
            if stmt.prop in {"name", "alias"}:
                statements.append(stmt)

        table = Table(box=box.SIMPLE, expand=True)
        table.add_column("Dataset", style="cyan", max_width=20)
        table.add_column("Entity ID", style="magenta", max_width=30)
        table.add_column("Prop", style="blue")
        table.add_column("Lang", style="green")
        table.add_column("Name", style="yellow")

        for stmt in sorted(statements, key=self._sort_key):
            table.add_row(
                stmt.dataset, stmt.entity_id, stmt.prop, stmt.lang, "• " + stmt.value
            )

        self.console.print(f"[bold]{title}[/bold]:")
        self.console.print(f"{entity.id}")
        self.console.print(table)

    def report(self) -> None:
        conflicts = list(self.get_conflicting_matches())
        if not conflicts:
            return

        self.console.print("[bold]Potential conflicting matches found:\n[/bold]")
        for candidate_id, left_id, right_id in self.get_conflicting_matches():
            left = self.view.get_entity(left_id)
            right = self.view.get_entity(right_id)
            candidate = self.view.get_entity(candidate_id)

            if candidate:
                self.report_conflicting_match("Candidate", candidate)
            if left:
                self.report_conflicting_match("Left side of negative decision", left)
            if right:
                self.report_conflicting_match("Right side of negative decision", right)
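ConflictingMatchReporter collects high-scoring candidate pairs via check_match and then flags candidates that match two entities already judged as different. A minimal sketch of the conflict-detection logic, not part of the package: the fake view and resolver below are stand-ins so the example runs without a database; a real run would pass a store View and a Resolver.

from nomenklatura.judgement import Judgement
from nomenklatura.conflicting_match import ConflictingMatchReporter

class FakeResolver:
    def get_judgement(self, left_id, right_id):
        # Pretend "a" and "b" were previously judged to be different entities.
        if {left_id, right_id} == {"a", "b"}:
            return Judgement.NEGATIVE
        return Judgement.NO_JUDGEMENT

class FakeView:
    def get_entity(self, entity_id):
        return None  # no entity details available in this sketch

reporter = ConflictingMatchReporter(FakeView(), FakeResolver(), threshold=0.7)
reporter.check_match(0.9, "candidate-1", "a")
reporter.check_match(0.8, "candidate-1", "b")

# candidate-1 matches both "a" and "b", which carry a NEGATIVE judgement:
print(list(reporter.get_conflicting_matches()))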
nomenklatura/data/er-unstable.pkl
ADDED
Binary file
nomenklatura/data/regression-v1.pkl
ADDED
Binary file
nomenklatura/db.py
ADDED
@@ -0,0 +1,139 @@
from contextlib import contextmanager
from functools import cache
from typing import Any, Dict, Generator, Iterable, List, Mapping, Optional, cast
import logging

from followthemoney import Statement
from followthemoney.statement.util import get_prop_type
from sqlalchemy import (
    Boolean,
    Column,
    DateTime,
    Dialect,
    MetaData,
    Table,
    Unicode,
    create_engine,
    delete,
)
from sqlalchemy.engine import Connection, Engine
from sqlalchemy.dialects.postgresql import insert as psql_insert
from sqlalchemy.dialects.sqlite import insert as sqlite_insert

from nomenklatura import settings

Conn = Connection
Connish = Optional[Connection]
KEY_LEN = 255
VALUE_LEN = 65535

log = logging.getLogger(__name__)


@cache
def get_engine(url: Optional[str] = None) -> Engine:
    url = url or settings.DB_URL
    connect_args = {}
    if url.startswith("postgres"):
        connect_args["options"] = f"-c statement_timeout={settings.DB_STMT_TIMEOUT}"

    return create_engine(
        url, pool_size=settings.DB_POOL_SIZE, connect_args=connect_args
    )


@cache
def get_metadata() -> MetaData:
    return MetaData()


@contextmanager
def ensure_tx(conn: Connish = None) -> Generator[Connection, None, None]:
    if conn is not None:
        yield conn
        return
    engine = get_engine()
    with engine.begin() as conn:
        yield conn


def make_statement_table(
    metadata: MetaData,
    name: str = settings.STATEMENT_TABLE,
) -> Table:
    return Table(
        name,
        metadata,
        Column("id", Unicode(KEY_LEN), primary_key=True, unique=True),
        Column("entity_id", Unicode(KEY_LEN), index=True, nullable=False),
        Column("canonical_id", Unicode(KEY_LEN), index=True, nullable=False),
        Column("prop", Unicode(KEY_LEN), index=True, nullable=False),
        Column("prop_type", Unicode(KEY_LEN), index=True, nullable=False),
        Column("schema", Unicode(KEY_LEN), index=True, nullable=False),
        Column("value", Unicode(VALUE_LEN), nullable=False),
        Column("original_value", Unicode(VALUE_LEN), nullable=True),
        Column("dataset", Unicode(KEY_LEN), index=True),
        Column("origin", Unicode(KEY_LEN), index=True),
        Column("lang", Unicode(KEY_LEN), nullable=True),
        Column("external", Boolean, default=False, nullable=False),
        Column("first_seen", DateTime, nullable=True),
        Column("last_seen", DateTime, nullable=True),
    )


def _upsert_statement_batch(
    dialect: Dialect, conn: Connection, table: Table, batch: List[Mapping[str, Any]]
) -> None:
    """Create an upsert statement for the given table and engine."""
    if dialect.name == "sqlite":
        lstmt = sqlite_insert(table).values(batch)
        lstmt = lstmt.on_conflict_do_nothing(index_elements=["id"])
        conn.execute(lstmt)
    elif dialect.name in ("postgresql", "postgres"):
        pstmt = psql_insert(table).values(batch)
        pstmt = pstmt.on_conflict_do_nothing(index_elements=["id"])
        conn.execute(pstmt)
    else:
        raise NotImplementedError(f"Upsert not implemented for dialect {dialect.name}")


def insert_statements(
    engine: Engine,
    table: Table,
    dataset_name: str,
    statements: Iterable[Statement],
    batch_size: int = settings.STATEMENT_BATCH,
) -> None:
    dataset_count: int = 0
    is_postgresql = "postgres" in engine.dialect.name
    with engine.begin() as conn:
        del_q = delete(table).where(table.c.dataset == dataset_name)
        conn.execute(del_q)
        batch: List[Mapping[str, Any]] = []

        for stmt in statements:
            if is_postgresql:
                row = cast(Dict[str, Any], stmt.to_dict())
                row["prop_type"] = get_prop_type(row["schema"], row["prop"])
            else:
                row = stmt.to_db_row()
            batch.append(row)
            dataset_count += 1
            if len(batch) >= batch_size:
                args = (len(batch), dataset_count, dataset_name)
                log.info("Inserting batch %s statements (total: %s) into %r" % args)
                _upsert_statement_batch(engine.dialect, conn, table, batch)
                batch = []
        if len(batch):
            _upsert_statement_batch(engine.dialect, conn, table, batch)
    log.info("Load complete: %r (%d total)" % (dataset_name, dataset_count))


# TODO: consider offering a COPY-based loader:
# raw_conn = await conn.get_raw_connection()
# driver_conn: Connection = raw_conn.driver_connection
# result = await driver_conn.copy_records_to_table(
#     stmt_table.name,
#     records=load_data_rows(),
#     columns=COLUMNS,
# )
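insert_statements deletes a dataset's existing rows and bulk-upserts the new ones in batches, picking the SQLite or PostgreSQL insert dialect as needed. A minimal sketch of wiring the helpers above together, not part of the package; the SQLite URL and empty statement list are placeholders, and in practice the iterable would yield followthemoney Statement objects.

from nomenklatura.db import get_engine, get_metadata, make_statement_table, insert_statements

engine = get_engine("sqlite:///nk.db")  # engines are cached per URL via functools.cache
metadata = get_metadata()
table = make_statement_table(metadata)
metadata.create_all(bind=engine, tables=[table], checkfirst=True)

# Placeholder: an iterable of followthemoney Statement objects would go here.
statements = []
insert_statements(engine, table, "demo_dataset", statements, batch_size=5000)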
nomenklatura/delta.py
ADDED