nomenklatura_mpt-4.1.9-py3-none-any.whl

This diff shows the content of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Files changed (118)
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from nomenklatura.resolver import Resolver
+ from nomenklatura.store import Store, View
+ from nomenklatura.index import Index
+
+ __version__ = "4.1.9"
+ __all__ = [
+     "Resolver",
+     "Index",
+     "Store",
+     "View",
+ ]
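
The top-level module pins the version and re-exports the package's core API. A minimal import sketch (illustrative only; assumes the wheel is installed, which exposes the `nomenklatura` import package):

    import nomenklatura
    from nomenklatura import Resolver, Index, Store, View

    # The wheel is named nomenklatura-mpt, but the import package stays `nomenklatura`.
    assert nomenklatura.__version__ == "4.1.9"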
nomenklatura/cache.py ADDED
@@ -0,0 +1,194 @@
+ import math
+ import json
+ import logging
+ from random import randint
+ from dataclasses import dataclass
+ from typing import Any, cast, Dict, Optional, Union, Generator
+ from datetime import datetime, timedelta
+ from sqlalchemy import MetaData
+ from sqlalchemy import Table, Column, DateTime, Unicode
+ from sqlalchemy.engine import Engine, Connection, Transaction
+ from sqlalchemy.future import select
+ from sqlalchemy.sql.expression import delete
+ from sqlalchemy.exc import OperationalError, InvalidRequestError
+ from sqlalchemy.dialects.postgresql import insert as upsert
+ from rigour.time import naive_now
+ from followthemoney import Dataset
+
+ from nomenklatura.db import get_engine, get_metadata
+
+
+ log = logging.getLogger(__name__)
+ Value = Union[str, None]
+
+
+ @dataclass
+ class CacheValue:
+     key: str
+     dataset: Optional[str]
+     text: Value
+     timestamp: datetime
+
+
+ def randomize_cache(days: int) -> timedelta:
+     min_cache = max(1, math.ceil(days * 0.5))
+     max_cache = math.ceil(days * 1.3)
+     return timedelta(days=randint(min_cache, max_cache))
+
+
+ class Cache(object):
+     def __init__(
+         self, engine: Engine, metadata: MetaData, dataset: Dataset, create: bool = False
+     ) -> None:
+         self.dataset = dataset
+         self._engine = engine
+         self._conn: Optional[Connection] = None
+         self._transaction: Optional[Transaction] = None
+         self._table = Table(
+             "cache",
+             metadata,
+             Column("key", Unicode(), primary_key=True),
+             Column("text", Unicode(), nullable=True),
+             Column("dataset", Unicode(), nullable=False),
+             Column("timestamp", DateTime, index=True),
+             extend_existing=True,
+         )
+         if create:
+             metadata.create_all(bind=engine, checkfirst=True, tables=[self._table])
+
+         self._preload: Dict[str, CacheValue] = {}
+
+     @property
+     def conn(self) -> Connection:
+         if self._conn is None:
+             self._conn = self._engine.connect()
+             self._transaction = self._conn.begin()
+         return self._conn
+
+     def set(self, key: str, value: Value) -> None:
+         self._preload.pop(key, None)
+         cache = {
+             "timestamp": naive_now(),
+             "key": key,
+             "dataset": self.dataset.name,
+             "text": value,
+         }
+         try:
+             istmt = upsert(self._table).values(cache)
+             values = dict(
+                 timestamp=istmt.excluded.timestamp,
+                 text=istmt.excluded.text,
+                 dataset=istmt.excluded.dataset,
+             )
+             stmt = istmt.on_conflict_do_update(index_elements=["key"], set_=values)
+             self.conn.execute(stmt)
+         except (OperationalError, InvalidRequestError) as exc:
+             log.exception("Error while saving to cache: %s", exc)
+             self.reset()
+
+     def set_json(self, key: str, value: Any) -> None:
+         return self.set(key, json.dumps(value))
+
+     def get(self, key: str, max_age: Optional[int] = None) -> Optional[Value]:
+         if max_age is not None and max_age < 1:
+             return None
+
+         cache_cutoff = None
+         if max_age is not None:
+             cache_cutoff = naive_now() - randomize_cache(max_age)
+
+         cache = self._preload.get(key)
+         if cache is not None:
+             if cache_cutoff is not None and cache.timestamp < cache_cutoff:
+                 return None
+             return cache.text
+
+         q = select(self._table.c.text)
+         q = q.filter(self._table.c.key == key)
+         if cache_cutoff is not None:
+             q = q.filter(self._table.c.timestamp > cache_cutoff)
+         q = q.order_by(self._table.c.timestamp.desc())
+         q = q.limit(1)
+         try:
+             result = self.conn.execute(q)
+             row = result.fetchone()
+         except InvalidRequestError as ire:
+             log.exception("Cache fetch error: %s", ire)
+             self.reset()
+             return None
+         if row is not None:
+             return cast(Optional[str], row.text)
+         return None
+
+     def get_json(self, key: str, max_age: Optional[int] = None) -> Optional[Any]:
+         text = self.get(key, max_age=max_age)
+         if text is None:
+             return None
+         return json.loads(text)
+
+     def has(self, key: str) -> bool:
+         return self.get(key) is not None
+
+     def delete(self, key: str) -> None:
+         self._preload.pop(key, None)
+         pq = delete(self._table)
+         pq = pq.where(self._table.c.key == key)
+         try:
+             self.conn.execute(pq)
+         except InvalidRequestError as ire:
+             log.exception("Cache delete error: %s", ire)
+             self.reset()
+         return None
+
+     def all(self, like: Optional[str]) -> Generator[CacheValue, None, None]:
+         q = select(self._table)
+         if like is not None:
+             q = q.filter(self._table.c.key.like(like))
+
+         result = self.conn.execute(q)
+         for row in result.yield_per(10000):
+             yield CacheValue(row.key, row.dataset, row.text, row.timestamp)
+
+     def preload(self, like: Optional[str] = None) -> None:
+         log.info("Pre-loading cache: %r", like)
+         for cache in self.all(like=like):
+             self._preload[cache.key] = cache
+
+     def clear(self) -> None:
+         try:
+             pq = delete(self._table)
+             pq = pq.where(self._table.c.dataset == self.dataset.name)
+             self.conn.execute(pq)
+         except InvalidRequestError:
+             log.exception("Cannot clear cache from database")
+             self.reset()
+
+     def reset(self) -> None:
+         if self._conn is not None:
+             self._conn.close()
+         self._conn = None
+         self._transaction = None
+
+     def flush(self) -> None:
+         # log.info("Flushing cache.")
+         if self._transaction is not None:
+             try:
+                 self._transaction.commit()
+             except InvalidRequestError:
+                 log.exception("Transaction failed, cannot store cache state.")
+             self.reset()
+
+     def close(self) -> None:
+         self.flush()
+
+     def __repr__(self) -> str:
+         return f"<Cache({self._table!r})>"
+
+     def __hash__(self) -> int:
+         return hash((self.dataset.name, self._table.name))
+
+     @classmethod
+     def make_default(cls, dataset: Dataset) -> "Cache":
+         engine = get_engine()
+         metadata = get_metadata()
+         return cls(engine, metadata, dataset, create=True)
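
`Cache` wraps a single `cache` table keyed by string, with per-dataset ownership and jittered expiry: `randomize_cache(days)` draws the cutoff between roughly 0.5x and 1.3x of `max_age`, so a bulk-loaded cache does not expire all at once. A usage sketch (assumes a database reachable via the package's `settings.DB_URL` default; the dataset spec is illustrative):

    from followthemoney import Dataset
    from nomenklatura.cache import Cache

    # Illustrative dataset spec; Dataset.make() is also how cli.py builds datasets.
    dataset = Dataset.make({"name": "demo", "title": "Demo dataset"})
    cache = Cache.make_default(dataset)  # create=True builds the table if missing

    cache.set_json("api:acme", {"status": "ok"})
    # max_age is in days; a row older than the jittered cutoff is treated as a miss.
    hit = cache.get_json("api:acme", max_age=30)
    cache.close()  # close() calls flush(), committing the open transaction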
nomenklatura/cli.py ADDED
@@ -0,0 +1,260 @@
+ import os
+ import shutil
+ import yaml
+ import click
+ import logging
+ from pathlib import Path
+ from typing import Generator, Optional, Tuple
+ from followthemoney import Dataset, ValueEntity, StatementEntity as Entity
+ from followthemoney.statement import Statement, CSV, FORMATS
+ from followthemoney.statement import write_statements, read_path_statements
+ from followthemoney.cli.util import path_writer, InPath, OutPath
+ from followthemoney.cli.util import path_entities, write_entity
+ from followthemoney.cli.aggregate import sorted_aggregate
+
+ from nomenklatura.cache import Cache
+ from nomenklatura.matching import train_v1_matcher, train_erun_matcher
+ from nomenklatura.store import load_entity_file_store
+ from nomenklatura.resolver import Resolver, Linker
+ from nomenklatura.enrich import Enricher, make_enricher, match, enrich
+ from nomenklatura.matching import get_algorithm, DefaultAlgorithm
+ from nomenklatura.xref import xref as run_xref
+ from nomenklatura.tui import dedupe_ui
+ from nomenklatura.matching.bench import bench_matcher
+
+ INDEX_SEGMENT = "xref-index"
+
+ log = logging.getLogger(__name__)
+
+ ResPath = click.Path(dir_okay=False, writable=True, path_type=Path)
+
+
+ def _load_enricher(path: Path) -> Tuple[Dataset, Enricher[Dataset]]:
+     with open(path, "r") as fh:
+         data = yaml.safe_load(fh)
+         dataset = Dataset.make(data)
+         cache = Cache.make_default(dataset)
+         enricher = make_enricher(dataset, cache, data)
+         if enricher is None:
+             raise TypeError("Could not load enricher")
+         return dataset, enricher
+
+
+ def _get_linker() -> Linker[Entity]:
+     resolver = Resolver[Entity].make_default()
+     linker = resolver.get_linker()
+     resolver.close()
+     return linker
+
+
+ @click.group(help="Nomenklatura data integration")
+ def cli() -> None:
+     logging.basicConfig(level=logging.INFO)
+
+
+ @cli.command("xref", help="Generate dedupe candidates")
+ @click.argument("path", type=InPath)
+ @click.option("-a", "--auto-threshold", type=click.FLOAT, default=None)
+ @click.option("-l", "--limit", type=click.INT, default=5000)
+ @click.option("--algorithm", default=DefaultAlgorithm.NAME)
+ @click.option("--scored/--unscored", is_flag=True, type=click.BOOL, default=True)
+ @click.option(
+     "-c",
+     "--clear",
+     is_flag=True,
+     default=False,
+     help="Clear the index directory, if it exists.",
+ )
+ def xref_file(
+     path: Path,
+     auto_threshold: Optional[float] = None,
+     algorithm: str = DefaultAlgorithm.NAME,
+     limit: int = 5000,
+     scored: bool = True,
+     clear: bool = False,
+ ) -> None:
+     resolver = Resolver[Entity].make_default()
+     resolver.begin()
+     store = load_entity_file_store(path, resolver=resolver)
+     algorithm_type = get_algorithm(algorithm)
+     if algorithm_type is None:
+         raise click.Abort(f"Unknown algorithm: {algorithm}")
+
+     index_dir = Path(
+         os.environ.get("NOMENKLATURA_INDEX_PATH", path.parent / INDEX_SEGMENT)
+     )
+     if clear and index_dir.exists():
+         log.info("Clearing index: %s", index_dir)
+         shutil.rmtree(index_dir, ignore_errors=True)
+     run_xref(
+         resolver,
+         store,
+         index_dir,
+         auto_threshold=auto_threshold,
+         algorithm=algorithm_type,
+         scored=scored,
+         limit=limit,
+     )
+     resolver.commit()
+     log.info("Xref complete in: %r", resolver)
+
+
+ @cli.command("prune", help="Remove dedupe candidates")
+ def xref_prune() -> None:
+     resolver = Resolver[Entity].make_default()
+     resolver.begin()
+     resolver.prune()
+     resolver.commit()
+
+
+ @cli.command("apply", help="Apply resolver to an entity stream")
+ @click.argument("path", type=InPath)
+ @click.option("-o", "--outpath", type=OutPath, default="-")
+ @click.option(
+     "-d",
+     "--dataset",
+     type=str,
+     default=None,
+     help="Add a dataset to the entity metadata",
+ )
+ def apply(path: Path, outpath: Path, dataset: Optional[str] = None) -> None:
+     linker = _get_linker()
+     with path_writer(outpath) as outfh:
+         for proxy in path_entities(path, ValueEntity):
+             proxy = linker.apply_stream(proxy)
+             if dataset is not None:
+                 proxy.datasets.add(dataset)
+             write_entity(outfh, proxy)
+
+
+ @cli.command("sorted-aggregate", help="Merge sort-order entities")
+ @click.option("-i", "--infile", type=InPath, default="-")
+ @click.option("-o", "--outfile", type=OutPath, default="-")
+ def sorted_aggregate_(infile: Path, outfile: Path) -> None:
+     sorted_aggregate(infile, outfile, ValueEntity)
+
+
+ @cli.command("make-sortable", help="Convert entities into plain-text sortable form")
+ @click.argument("path", type=InPath)
+ @click.option("-o", "--outpath", type=OutPath, default="-")
+ def make_sortable(path: Path, outpath: Path) -> None:
+     with path_writer(outpath) as outfh:
+         for entity in path_entities(path, Entity):
+             write_entity(outfh, entity)
+
+
+ @cli.command("dedupe", help="Interactively judge xref candidates")
+ @click.argument("path", type=InPath)
+ @click.option("-x", "--xref", is_flag=True, default=False)
+ def dedupe(path: Path, xref: bool = False) -> None:
+     resolver = Resolver[Entity].make_default()
+     resolver.begin()
+     store = load_entity_file_store(path, resolver=resolver)
+     if xref:
+         index_dir = path.parent / INDEX_SEGMENT
+         run_xref(resolver, store, index_dir)
+     resolver.commit()
+
+     dedupe_ui(resolver, store)
+
+
+ @cli.command("train-v1-matcher", help="Train a matching model from judgement pairs")
+ @click.argument("pairs_file", type=InPath)
+ def train_v1_matcher_(pairs_file: Path) -> None:
+     train_v1_matcher(pairs_file)
+
+
+ @cli.command("train-erun-matcher", help="Train an ER model from judgement pairs")
+ @click.argument("pairs_file", type=InPath)
+ def train_erun_matcher_(pairs_file: Path) -> None:
+     train_erun_matcher(pairs_file)
+
+
+ @cli.command("match", help="Generate matches from an enrichment source")
+ @click.argument("config", type=InPath)
+ @click.argument("entities", type=InPath)
+ @click.option("-o", "--outpath", type=OutPath, default="-")
+ def match_command(
+     config: Path,
+     entities: Path,
+     outpath: Path,
+ ) -> None:
+     resolver = Resolver[Entity].make_default()
+     _, enricher = _load_enricher(config)
+
+     try:
+         resolver.begin()
+         with path_writer(outpath) as fh:
+             stream = path_entities(entities, Entity)
+             for proxy in match(enricher, resolver, stream):
+                 write_entity(fh, proxy)
+         resolver.commit()
+     finally:
+         enricher.close()
+
+
+ @cli.command("enrich", help="Fetch extra info from an enrichment source")
+ @click.argument("config", type=InPath)
+ @click.argument("entities", type=InPath)
+ @click.option("-o", "--outpath", type=OutPath, default="-")  # noqa
+ def enrich_command(
+     config: Path,
+     entities: Path,
+     outpath: Path,
+ ) -> None:
+     resolver = Resolver[Entity].make_default()
+     _, enricher = _load_enricher(config)
+     try:
+         resolver.begin()
+         with path_writer(outpath) as fh:
+             stream = path_entities(entities, Entity)
+             for proxy in enrich(enricher, resolver, stream):
+                 write_entity(fh, proxy)
+         resolver.commit()
+     finally:
+         enricher.close()
+
+
+ @cli.command("apply-statements", help="Apply a resolver file to a set of statements")
+ @click.option("-i", "--infile", type=InPath, default="-")
+ @click.option("-o", "--outpath", type=OutPath, default="-")
+ @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
+ def statements_apply(infile: Path, outpath: Path, format: str) -> None:
+     linker = _get_linker()
+
+     def _generate() -> Generator[Statement, None, None]:
+         for stmt in read_path_statements(infile, format=format):
+             yield linker.apply_statement(stmt)
+
+     with path_writer(outpath) as outfh:
+         write_statements(outfh, format, _generate())
+
+
+ @cli.command("load-resolver", help="Load resolver edges from file into database")
+ @click.argument("source", type=InPath)
+ def load_resolver(source: Path) -> None:
+     resolver = Resolver[Entity].make_default()
+     resolver.begin()
+     resolver.load(source)
+     resolver.commit()
+
+
+ @cli.command("dump-resolver", help="Dump resolver decisions from database to file")
+ @click.argument("target", type=OutPath)
+ def dump_resolver(target: Path) -> None:
+     resolver = Resolver[Entity].make_default()
+     resolver.begin()
+     resolver.dump(target)
+     resolver.rollback()
+
+
+ @cli.command("bench", help="Benchmark a matching algorithm")
+ @click.argument("name", type=str)
+ @click.argument("pairs_file", type=InPath)
+ @click.option("-n", "--number", type=int, default=1000)
+ def bench(name: str, pairs_file: Path, number: int = 1000) -> None:
+     bench_matcher(name, pairs_file, number)
+
+
+ if __name__ == "__main__":
+     cli()
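
Each command is a regular click callback on the `cli` group, so the whole surface can be exercised in-process with click's test runner; a sketch (the entity file path is a placeholder):

    from click.testing import CliRunner
    from nomenklatura.cli import cli

    runner = CliRunner()
    # Equivalent to `nomenklatura xref entities.ftm.json -l 500` on a shell.
    result = runner.invoke(cli, ["xref", "entities.ftm.json", "-l", "500"])
    print(result.exit_code, result.output)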
nomenklatura/conflicting_match.py ADDED
@@ -0,0 +1,80 @@
+ from typing import Dict, Set, Tuple, Generic, Generator
+ from itertools import combinations
+ from collections import defaultdict
+ from rich.console import Console
+ from rich.table import Table
+ from rich import box
+
+ from followthemoney import DS, Statement, SE
+ from nomenklatura.store import View
+ from nomenklatura.judgement import Judgement
+ from nomenklatura.resolver import Resolver
+
+
+ class ConflictingMatchReporter(Generic[SE]):
+     def __init__(self, view: View[DS, SE], resolver: Resolver[SE], threshold: float):
+         self.console = Console()
+         self.view = view
+         self.resolver = resolver
+         self.threshold = threshold
+         self.matches: Dict[str, Set[str]] = defaultdict(set)
+
+     def check_match(self, score: float, left_id: str, right_id: str) -> None:
+         if score > self.threshold:
+             self.matches[left_id].add(right_id)
+             self.matches[right_id].add(left_id)
+
+     def get_conflicting_matches(self) -> Generator[Tuple[str, str, str], None, None]:
+         for candidate_id, matches in self.matches.items():
+             for left_id, right_id in combinations(matches, 2):
+                 judgement = self.resolver.get_judgement(left_id, right_id)
+                 if judgement == Judgement.NEGATIVE:
+                     yield candidate_id, left_id, right_id
+
+     @staticmethod
+     def _sort_key(stmt: Statement) -> Tuple[str, str, int, str, int]:
+         prop_order = 0 if stmt.prop == "name" else 1
+         lang_order = 0 if stmt.lang is None else 1
+         return (stmt.dataset, stmt.entity_id, prop_order, stmt.value, lang_order)
+
+     def report_conflicting_match(self, title: str, entity: SE) -> None:
+         if entity.id is None:
+             return
+         statements = []
+         for stmt in entity.statements:
+             if stmt.prop in {"name", "alias"}:
+                 statements.append(stmt)
+
+         table = Table(box=box.SIMPLE, expand=True)
+         table.add_column("Dataset", style="cyan", max_width=20)
+         table.add_column("Entity ID", style="magenta", max_width=30)
+         table.add_column("Prop", style="blue")
+         table.add_column("Lang", style="green")
+         table.add_column("Name", style="yellow")
+
+         for stmt in sorted(statements, key=self._sort_key):
+             table.add_row(
+                 stmt.dataset, stmt.entity_id, stmt.prop, stmt.lang, "• " + stmt.value
+             )
+
+         self.console.print(f"[bold]{title}[/bold]:")
+         self.console.print(f"{entity.id}")
+         self.console.print(table)
+
+     def report(self) -> None:
+         conflicts = list(self.get_conflicting_matches())
+         if not conflicts:
+             return
+
+         self.console.print("[bold]Potential conflicting matches found:\n[/bold]")
+         for candidate_id, left_id, right_id in conflicts:
+             left = self.view.get_entity(left_id)
+             right = self.view.get_entity(right_id)
+             candidate = self.view.get_entity(candidate_id)
+
+             if candidate:
+                 self.report_conflicting_match("Candidate", candidate)
+             if left:
+                 self.report_conflicting_match("Left side of negative decision", left)
+             if right:
+                 self.report_conflicting_match("Right side of negative decision", right)
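
The reporter flags "triangles": one candidate that scores above the threshold against two entities which a human has already judged to be different. A driving sketch (store/view wiring mirrors cli.py above; `default_view()` and the 0.7 threshold are assumptions, and the entity IDs are placeholders):

    from pathlib import Path
    from followthemoney import StatementEntity as Entity
    from nomenklatura.resolver import Resolver
    from nomenklatura.store import load_entity_file_store
    from nomenklatura.conflicting_match import ConflictingMatchReporter

    resolver = Resolver[Entity].make_default()
    resolver.begin()
    store = load_entity_file_store(Path("entities.ftm.json"), resolver=resolver)
    view = store.default_view()  # assumed Store API

    reporter = ConflictingMatchReporter(view, resolver, threshold=0.7)
    reporter.check_match(0.93, "ent-a", "ent-b")  # kept: above threshold
    reporter.check_match(0.88, "ent-a", "ent-c")
    # If ent-b and ent-c carry a NEGATIVE judgement, ent-a matching both is
    # contradictory, and report() prints all three entities as rich tables.
    reporter.report()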
nomenklatura/data/er-unstable.pkl ADDED
Binary file
nomenklatura/data/regression-v1.pkl ADDED
Binary file
nomenklatura/db.py ADDED
@@ -0,0 +1,139 @@
+ from contextlib import contextmanager
+ from functools import cache
+ from typing import Any, Dict, Generator, Iterable, List, Mapping, Optional, cast
+ import logging
+
+ from followthemoney import Statement
+ from followthemoney.statement.util import get_prop_type
+ from sqlalchemy import (
+     Boolean,
+     Column,
+     DateTime,
+     Dialect,
+     MetaData,
+     Table,
+     Unicode,
+     create_engine,
+     delete,
+ )
+ from sqlalchemy.engine import Connection, Engine
+ from sqlalchemy.dialects.postgresql import insert as psql_insert
+ from sqlalchemy.dialects.sqlite import insert as sqlite_insert
+
+ from nomenklatura import settings
+
+ Conn = Connection
+ Connish = Optional[Connection]
+ KEY_LEN = 255
+ VALUE_LEN = 65535
+
+ log = logging.getLogger(__name__)
+
+
+ @cache
+ def get_engine(url: Optional[str] = None) -> Engine:
+     url = url or settings.DB_URL
+     connect_args = {}
+     if url.startswith("postgres"):
+         connect_args["options"] = f"-c statement_timeout={settings.DB_STMT_TIMEOUT}"
+
+     return create_engine(
+         url, pool_size=settings.DB_POOL_SIZE, connect_args=connect_args
+     )
+
+
+ @cache
+ def get_metadata() -> MetaData:
+     return MetaData()
+
+
+ @contextmanager
+ def ensure_tx(conn: Connish = None) -> Generator[Connection, None, None]:
+     if conn is not None:
+         yield conn
+         return
+     engine = get_engine()
+     with engine.begin() as conn:
+         yield conn
+
+
+ def make_statement_table(
+     metadata: MetaData,
+     name: str = settings.STATEMENT_TABLE,
+ ) -> Table:
+     return Table(
+         name,
+         metadata,
+         Column("id", Unicode(KEY_LEN), primary_key=True, unique=True),
+         Column("entity_id", Unicode(KEY_LEN), index=True, nullable=False),
+         Column("canonical_id", Unicode(KEY_LEN), index=True, nullable=False),
+         Column("prop", Unicode(KEY_LEN), index=True, nullable=False),
+         Column("prop_type", Unicode(KEY_LEN), index=True, nullable=False),
+         Column("schema", Unicode(KEY_LEN), index=True, nullable=False),
+         Column("value", Unicode(VALUE_LEN), nullable=False),
+         Column("original_value", Unicode(VALUE_LEN), nullable=True),
+         Column("dataset", Unicode(KEY_LEN), index=True),
+         Column("origin", Unicode(KEY_LEN), index=True),
+         Column("lang", Unicode(KEY_LEN), nullable=True),
+         Column("external", Boolean, default=False, nullable=False),
+         Column("first_seen", DateTime, nullable=True),
+         Column("last_seen", DateTime, nullable=True),
+     )
+
+
+ def _upsert_statement_batch(
+     dialect: Dialect, conn: Connection, table: Table, batch: List[Mapping[str, Any]]
+ ) -> None:
+     """Insert a batch of statement rows for the given table and dialect."""
+     if dialect.name == "sqlite":
+         lstmt = sqlite_insert(table).values(batch)
+         lstmt = lstmt.on_conflict_do_nothing(index_elements=["id"])
+         conn.execute(lstmt)
+     elif dialect.name in ("postgresql", "postgres"):
+         pstmt = psql_insert(table).values(batch)
+         pstmt = pstmt.on_conflict_do_nothing(index_elements=["id"])
+         conn.execute(pstmt)
+     else:
+         raise NotImplementedError(f"Upsert not implemented for dialect {dialect.name}")
+
+
+ def insert_statements(
+     engine: Engine,
+     table: Table,
+     dataset_name: str,
+     statements: Iterable[Statement],
+     batch_size: int = settings.STATEMENT_BATCH,
+ ) -> None:
+     dataset_count: int = 0
+     is_postgresql = "postgres" in engine.dialect.name
+     with engine.begin() as conn:
+         del_q = delete(table).where(table.c.dataset == dataset_name)
+         conn.execute(del_q)
+         batch: List[Mapping[str, Any]] = []
+
+         for stmt in statements:
+             if is_postgresql:
+                 row = cast(Dict[str, Any], stmt.to_dict())
+                 row["prop_type"] = get_prop_type(row["schema"], row["prop"])
+             else:
+                 row = stmt.to_db_row()
+             batch.append(row)
+             dataset_count += 1
+             if len(batch) >= batch_size:
+                 args = (len(batch), dataset_count, dataset_name)
+                 log.info("Inserting batch of %s statements (total: %s) into %r", *args)
+                 _upsert_statement_batch(engine.dialect, conn, table, batch)
+                 batch = []
+         if len(batch):
+             _upsert_statement_batch(engine.dialect, conn, table, batch)
+     log.info("Load complete: %r (%d total)", dataset_name, dataset_count)
+
+
+ # TODO: consider offering a COPY-based loader:
+ # raw_conn = await conn.get_raw_connection()
+ # driver_conn: Connection = raw_conn.driver_connection
+ # result = await driver_conn.copy_records_to_table(
+ #     stmt_table.name,
+ #     records=load_data_rows(),
+ #     columns=COLUMNS,
+ # )
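
`insert_statements` first deletes the dataset's existing rows, then streams the new ones in batches through the dialect-specific `ON CONFLICT DO NOTHING` insert above. A wiring sketch (the statement source is left abstract; `stmts()` is a placeholder for anything yielding followthemoney `Statement` objects):

    from typing import List
    from followthemoney import Statement
    from nomenklatura.db import (
        get_engine, get_metadata, make_statement_table, insert_statements
    )

    engine = get_engine()  # cached engine for settings.DB_URL
    table = make_statement_table(get_metadata())
    get_metadata().create_all(bind=engine, tables=[table], checkfirst=True)

    def stmts() -> List[Statement]:
        # Placeholder: e.g. `[s for e in entities for s in e.statements]`.
        return []

    # Replaces all rows for "demo", batching by settings.STATEMENT_BATCH.
    insert_statements(engine, table, "demo", stmts())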
nomenklatura/delta.py ADDED
@@ -0,0 +1,4 @@
+ ADD = "ADD"
+ MOD = "MOD"
+ DEL = "DEL"
+ OPS = [ADD, MOD, DEL]