ftmq-4.1.2-py3-none-any.whl → ftmq-4.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ftmq/__init__.py +1 -1
- ftmq/cli.py +81 -0
- ftmq/store/__init__.py +1 -1
- ftmq/store/base.py +1 -1
- ftmq/store/fragments/__init__.py +1 -1
- ftmq/store/fragments/dataset.py +143 -15
- ftmq/store/fragments/loader.py +13 -5
- ftmq/store/fragments/store.py +31 -3
- ftmq/store/lake.py +1 -1
- ftmq/util.py +66 -2
- {ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/METADATA +12 -9
- {ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/RECORD +16 -16
- {ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/WHEEL +1 -1
- {ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/entry_points.txt +0 -0
- {ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info/licenses}/LICENSE +0 -0
- {ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info/licenses}/NOTICE +0 -0
ftmq/__init__.py
CHANGED
ftmq/cli.py
CHANGED

@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import click
 from anystore.io import smart_write, smart_write_json, smart_write_model
 from click_default_group import DefaultGroup
@@ -11,6 +13,9 @@ from ftmq.model.dataset import Catalog, Dataset
 from ftmq.model.stats import Collector
 from ftmq.query import Query
 from ftmq.store import get_store
+from ftmq.store.fragments import get_fragments
+from ftmq.store.fragments import get_store as get_fragments_store
+from ftmq.store.fragments.settings import Settings as FragmentsSettings
 from ftmq.util import apply_dataset, parse_unknown_filters
 
 log = get_logger(__name__)
@@ -311,6 +316,82 @@ def store_iterate(
     smart_write_proxies(output_uri, store.iterate())
 
 
+@cli.group()
+def fragments():
+    pass
+
+
+fragments_settings = FragmentsSettings()
+
+
+@fragments.command("list-datasets")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="input file or uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+def fragments_list_datasets(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+):
+    """
+    List datasets within a fragments store
+    """
+    store = get_fragments_store(input_uri)
+    datasets = [ds.name for ds in store.all()]
+    smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+@fragments.command("iterate")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="fragments store input uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+@click.option("-s", "--schema", default=None, help="Filter by schema")
+@click.option(
+    "--since",
+    default=None,
+    help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+@click.option(
+    "--until",
+    default=None,
+    help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+def fragments_iterate(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+    dataset: str = None,
+    schema: str | None = None,
+    since: str | None = None,
+    until: str | None = None,
+):
+    """
+    Iterate all entities from a fragments dataset
+    """
+    fragments = get_fragments(dataset, database_uri=input_uri)
+
+    # Parse timestamp strings to datetime objects
+    since_dt = datetime.fromisoformat(since) if since else None
+    until_dt = datetime.fromisoformat(until) if until else None
+
+    smart_write_proxies(
+        output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
+    )
+
+
 @cli.command("aggregate")
 @click.option(
     "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
 )
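The new `fragments` command group exposes the fragments store on the command line (`ftmq fragments list-datasets`, `ftmq fragments iterate`). A minimal sketch of the equivalent programmatic call, assuming a local SQLite fragments store at `fragments.db` and a dataset named `my_dataset` (both hypothetical), and assuming `smart_write_proxies` is importable from `ftmq.io` as used elsewhere in ftmq:

from datetime import datetime

from ftmq.io import smart_write_proxies
from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="sqlite:///fragments.db")
since = datetime.fromisoformat("2024-01-01T00:00:00")
# stream Person entities touched since 2024 to stdout as FtM json lines
smart_write_proxies("-", fragments.iterate(schema="Person", since=since))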
ftmq/store/__init__.py
CHANGED

@@ -1,7 +1,7 @@
-from functools import cache
 from pathlib import Path
 from urllib.parse import urlparse
 
+from anystore.functools import weakref_cache as cache
 from anystore.types import Uri
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import Resolver, settings
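This release also swaps `functools.cache` for anystore's `weakref_cache`, here and in `base.py`, `lake.py`, and `util.py` below; presumably so that memoized store objects are not pinned in memory for the lifetime of the process but can be garbage-collected once callers drop their references. A minimal sketch of the drop-in usage, assuming the decorator is call-compatible with `functools.cache`:

from anystore.functools import weakref_cache as cache

@cache
def get_thing(uri: str) -> object:
    # expensive construction runs once per uri while a reference is held
    return object()  # stand-in for a real store/engine instance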
ftmq/store/base.py
CHANGED

@@ -1,7 +1,7 @@
-from functools import cache
 from typing import Generator, Iterable
 from urllib.parse import urlparse
 
+from anystore.functools import weakref_cache as cache
 from followthemoney import DefaultDataset
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import store as nk
ftmq/store/fragments/__init__.py
CHANGED
ftmq/store/fragments/dataset.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from contextlib import contextmanager
 from datetime import datetime
 from typing import Generator, Iterable, TypeAlias
 
@@ -9,6 +10,7 @@ from normality import slugify
 from sqlalchemy import (
     JSON,
     Column,
+    Connection,
     DateTime,
     String,
     Table,
@@ -28,16 +30,42 @@ from ftmq.util import make_dataset
 log = logging.getLogger(__name__)
 UNDEFINED = (OperationalError,)
 try:
-    from psycopg2.errors import UndefinedTable
+    from psycopg.errors import UndefinedTable
 
     UNDEFINED = (UndefinedTable, *UNDEFINED)
 except ImportError:
-    pass
+    try:
+        from psycopg2.errors import UndefinedTable
+
+        UNDEFINED = (UndefinedTable, *UNDEFINED)
+    except ImportError:
+        pass
 
 
 EntityFragments: TypeAlias = Generator[EntityProxy, None, None]
 
 
+@contextmanager
+def disable_timeout(conn: Connection, store):
+    # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+    # don't want to get cancelled if a idle_in_transaction_timeout is configured
+    # on the server
+    if store.is_postgres:
+        raw_conn = conn.connection.driver_connection
+        with raw_conn.cursor() as cursor:
+            cursor.execute("SET idle_in_transaction_session_timeout = 0")
+    try:
+        yield conn
+    finally:
+        if store.is_postgres:
+            try:
+                raw_conn = conn.connection.driver_connection
+                with raw_conn.cursor() as cursor:
+                    cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+            except Exception:
+                pass  # Connection might be closed
+
+
 class Fragments(object):
     def __init__(self, store, name, origin=NULL_ORIGIN):
         self.store = store
@@ -104,7 +132,9 @@ class Fragments(object):
     def bulk(self, size=1000):
         return BulkLoader(self, size)
 
-    def fragments(
+    def fragments(
+        self, entity_ids=None, fragment=None, schema=None, since=None, until=None
+    ):
         stmt = self.table.select()
         entity_ids = ensure_list(entity_ids)
         if len(entity_ids) == 1:
@@ -113,25 +143,42 @@ class Fragments(object):
             stmt = stmt.where(self.table.c.id.in_(entity_ids))
         if fragment is not None:
             stmt = stmt.where(self.table.c.fragment == fragment)
+        if schema is not None:
+            if self.store.is_postgres:
+                stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
+            else:
+                # SQLite JSON support - use json_extract function
+                stmt = stmt.where(
+                    func.json_extract(self.table.c.entity, "$.schema") == schema
+                )
+        if since is not None:
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         stmt = stmt.order_by(self.table.c.id)
         # stmt = stmt.order_by(self.table.c.origin)
         # stmt = stmt.order_by(self.table.c.fragment)
         conn = self.store.engine.connect()
         try:
-            conn
-
-
-
-
-
+            with disable_timeout(conn, self.store) as conn:
+                conn = conn.execution_options(stream_results=True)
+                for ent in conn.execute(stmt):
+                    data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                    if ent.origin != NULL_ORIGIN:
+                        data["origin"] = ent.origin
+                    yield data
         except Exception:
             self.reset()
             raise
         finally:
             conn.close()
 
-    def partials(
-
+    def partials(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        for fragment in self.fragments(
+            entity_ids=entity_id, schema=schema, since=since, until=until
+        ):
             try:
                 yield EntityProxy.from_dict(fragment, cleaned=True)
             except Exception:
@@ -140,18 +187,32 @@ class Fragments(object):
                 continue
             raise
 
-    def iterate(
+    def iterate(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        if entity_id is None:
+            log.info("Using batched iteration for complete dataset.")
+            yield from self.iterate_batched(
+                skip_errors=skip_errors, schema=schema, since=since, until=until
+            )
+            return
         entity = None
         invalid = None
         fragments = 1
-        for partial in self.partials(
+        for partial in self.partials(
+            entity_id=entity_id,
+            skip_errors=skip_errors,
+            schema=schema,
+            since=since,
+            until=until,
+        ):
             if partial.id == invalid:
                 continue
             if entity is not None:
                 if entity.id == partial.id:
                     fragments += 1
                     if fragments % 10000 == 0:
-                        log.
+                        log.warning(
                             "[%s:%s] aggregated %d fragments...",
                             entity.schema.name,
                             entity.id,
@@ -176,11 +237,76 @@ class Fragments(object):
         if entity is not None:
             yield entity
 
+    def iterate_batched(
+        self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        """
+        For large datasets an overall sort is not feasible, so we iterate in
+        sorted batched IDs.
+        """
+        for entity_ids in self.get_sorted_id_batches(
+            batch_size, schema=schema, since=since, until=until
+        ):
+            yield from self.iterate(
+                entity_id=entity_ids,
+                skip_errors=skip_errors,
+                schema=schema,
+                since=since,
+                until=until,
+            )
+
+    def get_sorted_id_batches(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[list[str], None, None]:
+        """
+        Get sorted ID batches to speed up iteration and useful to parallelize
+        processing of iterator Entities
+        """
+        last_id = None
+        with self.store.engine.connect() as conn:
+            while True:
+                stmt = select(self.table.c.id).distinct()
+                if last_id is not None:
+                    stmt = stmt.where(self.table.c.id > last_id)
+                if schema is not None:
+                    if self.store.is_postgres:
+                        stmt = stmt.where(
+                            self.table.c.entity["schema"].astext == schema
+                        )
+                    else:
+                        # SQLite JSON support - use json_extract function
+                        stmt = stmt.where(
+                            func.json_extract(self.table.c.entity, "$.schema") == schema
+                        )
+                if since is not None:
+                    stmt = stmt.where(self.table.c.timestamp >= since)
+                if until is not None:
+                    stmt = stmt.where(self.table.c.timestamp <= until)
+                stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                try:
+                    res = conn.execute(stmt)
+                    entity_ids = [r.id for r in res.fetchall()]
+                    if not entity_ids:
+                        return
+                    yield entity_ids
+                    last_id = entity_ids[-1]
+                except Exception:
+                    self.reset()
+                    raise
+
+    def get_sorted_ids(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[str, None, None]:
+        """Get sorted IDs, optionally filtered by schema"""
+        for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
+            yield from batch
+
     def statements(
         self,
         entity_ids: Iterable[str] | None = None,
         origin: str | None = None,
         since: datetime | None = None,
+        until: datetime | None = None,
     ) -> Statements:
         """Iterate unsorted statements with its fragment origins"""
         stmt = self.table.select()
@@ -192,7 +318,9 @@ class Fragments(object):
         if origin is not None:
             stmt = stmt.where(self.table.c.origin == origin)
         if since is not None:
-            stmt = stmt.where(self.table.c.timestamp
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         conn = self.store.engine.connect()
         default_dataset = make_dataset(self.name)
         try:
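The heart of the new batched iteration is keyset pagination: instead of one global ORDER BY over the whole fragments table, `get_sorted_id_batches` repeatedly selects the next `batch_size` distinct IDs greater than the last ID seen, and `iterate_batched` aggregates fragments batch by batch; `disable_timeout` additionally zeroes PostgreSQL's `idle_in_transaction_session_timeout` for the duration of the stream so long-running consumers are not cancelled by the server. The pagination pattern in isolation (a generic sketch; `fetch_ids` is a hypothetical stand-in for the SELECT built above):

from typing import Callable, Generator

def batched_ids(
    fetch_ids: Callable[..., list[str]], batch_size: int = 10_000
) -> Generator[list[str], None, None]:
    last_id = None
    while True:
        # fetch_ids returns up to `limit` ids sorted ascending, all > `after`
        batch = fetch_ids(after=last_id, limit=batch_size)
        if not batch:
            return
        yield batch
        last_id = batch[-1]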
ftmq/store/fragments/loader.py
CHANGED

@@ -26,11 +26,16 @@ EXCEPTIONS = (
     TimeoutError,
 )
 try:
-    from psycopg2 import DatabaseError, OperationalError
+    from psycopg import DatabaseError, OperationalError
 
     EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
 except ImportError:
-    pass
+    try:
+        from psycopg2 import DatabaseError, OperationalError
+
+        EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
+    except ImportError:
+        pass
 
 log = logging.getLogger(__name__)
 
@@ -50,9 +55,12 @@ class BulkLoader(object):
         else:
             entity = dict(entity)
         id_ = entity.pop("id")
-
-
-        self.
+        if id_:
+            self.buffer[(id_, origin, fragment)] = entity
+            if len(self.buffer) >= self.size:
+                self.flush()
+        else:
+            log.warning("Entity has no ID!")
 
     def _store_values(self, conn, values):
         table = self.dataset.table
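With the new guard, entities lacking an `id` are logged and skipped instead of being buffered. A usage sketch, reusing the `fragments` handle from the earlier sketch and assuming the loader keeps the classic `put()` API from followthemoney-store (not shown in this diff):

bulk = fragments.bulk(size=1000)
bulk.put({"id": "p1", "schema": "Person", "properties": {"name": ["Jane Doe"]}})
bulk.put({"schema": "Person", "properties": {}})  # no id: warned and skipped
bulk.flush()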
ftmq/store/fragments/store.py
CHANGED

@@ -11,14 +11,42 @@ class Store(object):
 
     PREFIX = "ftm"
 
+    def _adjust_psycopg3_uri(self, database_uri: str) -> str:
+        """Adjust PostgreSQL URI to use psycopg3 dialect if psycopg is available."""
+        if database_uri.startswith(("postgresql://", "postgres://")):
+            try:
+                import psycopg  # noqa: F401
+
+                # Use psycopg3 dialect for better performance and compatibility
+                if database_uri.startswith("postgresql://"):
+                    return database_uri.replace(
+                        "postgresql://", "postgresql+psycopg://", 1
+                    )
+                elif database_uri.startswith("postgres://"):
+                    return database_uri.replace(
+                        "postgres://", "postgresql+psycopg://", 1
+                    )
+            except ImportError:
+                # Fall back to psycopg2 if psycopg3 is not available
+                pass
+        return database_uri
+
     def __init__(
         self,
         database_uri: str,
         **config,
     ):
-        self.database_uri = database_uri
-
-
+        self.database_uri = self._adjust_psycopg3_uri(database_uri)
+
+        # Configure connection pooling for psycopg3
+        config.setdefault("pool_size", 1)
+        if self.database_uri.startswith("postgresql+psycopg://"):
+            config.setdefault("max_overflow", 5)
+            config.setdefault("pool_timeout", 60)
+            config.setdefault("pool_recycle", 3600)
+            config.setdefault("pool_pre_ping", True)
+
+        self.engine = create_engine(self.database_uri, future=True, **config)
         self.is_postgres = self.engine.dialect.name == "postgresql"
         self.meta = MetaData()
 
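The URI adjustment only rewrites the scheme prefix so SQLAlchemy selects the psycopg3 dialect when the `psycopg` package is importable; otherwise the URI is left untouched and SQLAlchemy falls back to its default psycopg2 dialect. The same behavior as a free function (sketch):

def adjust_uri(uri: str) -> str:
    try:
        import psycopg  # noqa: F401
    except ImportError:
        return uri  # keep default dialect (psycopg2)
    for prefix in ("postgresql://", "postgres://"):
        if uri.startswith(prefix):
            return uri.replace(prefix, "postgresql+psycopg://", 1)
    return uri

# with psycopg installed:
# adjust_uri("postgres://user:pw@host/db") == "postgresql+psycopg://user:pw@host/db"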
ftmq/store/lake.py
CHANGED

@@ -18,7 +18,6 @@ Layout:
 ```
 """
 
-from functools import cache
 from pathlib import Path
 from typing import Any, Generator, Iterable
 from urllib.parse import urlparse
@@ -26,6 +25,7 @@ from urllib.parse import urlparse
 import duckdb
 import numpy as np
 import pandas as pd
+from anystore.functools import weakref_cache as cache
 from anystore.lock import Lock
 from anystore.logging import get_logger
 from anystore.store.fs import Store as FSStore
ftmq/util.py
CHANGED

@@ -1,8 +1,9 @@
-from functools import
+from functools import lru_cache
 from typing import Any, Generator, Type
 
 import pycountry
-from anystore.
+from anystore.functools import weakref_cache as cache
+from anystore.types import SDict, StrGenerator
 from banal import ensure_list, is_listish
 from followthemoney import E, model
 from followthemoney.compare import _normalize_names
@@ -13,6 +14,8 @@ from followthemoney.schema import Schema
 from followthemoney.types import registry
 from followthemoney.util import make_entity_id, sanitize_text
 from normality import collapse_spaces, latinize_text, slugify
+from rigour.names import Name, Symbol, tag_org_name, tag_person_name
+from rigour.names.tokenize import normalize_name
 from rigour.text.scripts import can_latinize
 
 from ftmq.enums import Comparators
@@ -480,3 +483,64 @@ def must_str(value: Any) -> str:
     if not value:
         raise ValueError(f"Value invalid: `{value}`")
     return value
+
+
+SELECT_SYMBOLS = "__symbols__"
+SELECT_ANNOTATED = "__annotated__"
+
+
+def get_name_symbols(schema: Schema, *names: str) -> set[Symbol]:
+    """Get the rigour names symbols for the given schema and list of names"""
+    symbols: set[Symbol] = set()
+    if schema.is_a("Person"):
+        taggers = [tag_person_name]
+    elif schema.is_a("Organization"):
+        taggers = [tag_org_name]
+    elif schema.is_a("LegalEntity"):
+        taggers = [tag_org_name, tag_person_name]
+    else:
+        return symbols
+    for name in names:
+        n = Name(name)
+        for tagger in taggers:
+            for symbol in tagger(n, normalize_name).symbols:
+                symbols.add(symbol)
+    return symbols
+
+
+def get_symbols(entity: EntityProxy) -> set[Symbol]:
+    """Get the rigour names symbols for the given entity"""
+    if not entity.schema.is_a("LegalEntity"):
+        return set()
+    names = entity.get_type_values(registry.name, matchable=True)
+    return get_name_symbols(entity.schema, *names)
+
+
+def inline_symbols(entity: EntityProxy) -> None:
+    """Get the rigour names symbols for the given entity and write them to `indexText`"""
+    # clean up old symbols from indexText:
+    for text in entity.pop("indexText"):
+        if not text.startswith(SELECT_SYMBOLS):
+            entity.add("indexText", text)
+    symbols = get_symbols(entity)
+    entity.add("indexText", f"{SELECT_SYMBOLS} {','.join(map(str, symbols))}")
+
+
+def select_data(e: EntityProxy, prefix: str) -> StrGenerator:
+    """Select arbitrary stored data in `indexText` identified by given prefix"""
+    for text in e.get("indexText", quiet=True):
+        if text.startswith(prefix):
+            yield text.replace(prefix, "").strip()
+
+
+def select_symbols(e: EntityProxy) -> set[str]:
+    """Select stored symbols in `indexText`"""
+    symbols: set[str] = set()
+    for data in select_data(e, SELECT_SYMBOLS):
+        symbols.update(data.split(","))
+    return symbols
+
+
+def select_annotations(e: EntityProxy) -> set[str]:
+    """Select stored annotations in `indexText`"""
+    return {s for s in select_data(e, SELECT_ANNOTATED)}
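The new helpers tag entity names with rigour name symbols and stash the result in the `indexText` property under the `__symbols__` prefix, so downstream indexers can read the symbols back without recomputing them. A round-trip sketch, assuming a followthemoney `Person` proxy:

from followthemoney import model

from ftmq.util import inline_symbols, select_symbols

entity = model.make_entity("Person")
entity.id = "p1"
entity.add("name", "Jane Doe")

inline_symbols(entity)         # writes "__symbols__ <ids>" into indexText
print(select_symbols(entity))  # the symbol ids stored above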
{ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/METADATA
CHANGED

@@ -1,8 +1,10 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ftmq
-Version: 4.1.2
+Version: 4.3.1
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
+License-File: LICENSE
+License-File: NOTICE
 Author: Simon Wörpel
 Author-email: simon.woerpel@pm.me
 Requires-Python: >=3.11,<4
@@ -19,24 +21,25 @@ Provides-Extra: postgres
 Provides-Extra: redis
 Provides-Extra: sql
 Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
-Requires-Dist: anystore (>=0.
+Requires-Dist: anystore (>=0.4.0,<0.5.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.1
-Requires-Dist: duckdb (>=1.
+Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
+Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.
+Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.
+Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
-Requires-Dist: pandas (>=2.3.
+Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
-Requires-Dist:
+Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
 Requires-Dist: pycountry (>=24.6.1,<25.0.0)
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
+Requires-Dist: rigour (>=1.4.0,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
{ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
-ftmq/__init__.py,sha256=
+ftmq/__init__.py,sha256=NbZU0yvkkgqiSaOjVHmRAE8FeeI0fIiwknWOfU9j1Ow,245
 ftmq/aggregate.py,sha256=nyAI5w6jKG1T4Jf2yy1ruhPh0vY6p7JWOEYh0SLBdZY,2163
 ftmq/aggregations.py,sha256=YLu1WF3WgcI3tadWKqsoZk3f_3bYUJetIREy2N1u_EM,4794
-ftmq/cli.py,sha256=
+ftmq/cli.py,sha256=e4wLGfGFmn-8Z6_EhEcOJvUwQM3R-J1i40qh2UVHcGo,12078
 ftmq/enums.py,sha256=4AJ6Ii8Bnbfz0BSznyK0IUopy-XMquuAuvud8ZcrD54,2521
 ftmq/filters.py,sha256=TKs454wbSvA5tPj8WbpIrMojctE2jGTLnrrVQKM9PE4,7908
 ftmq/io.py,sha256=gUbqoZuYXWwxoiJ505H7FhmXLYr5MEUiCEvqIiOkbgo,3789
@@ -14,25 +14,25 @@ ftmq/model/stats.py,sha256=BiOK7x-JymI5f3dRpwgr3J7OknDP1pe9955Y7AvOJGo,4048
 ftmq/query.py,sha256=6BJXZm_oh5eUyLsQ4tu3GRcpWDPBQFd-MUo6fTy_foQ,13672
 ftmq/similar.py,sha256=P8dGVtFq2LYUXMZrWJLlLFtzAwgBVcbarvxXFIX5oYY,998
 ftmq/sql.py,sha256=ITksJzKaJYwljOG4XWZzb07gRHMRxNHaI3TCug4UVcU,12249
-ftmq/store/__init__.py,sha256=
+ftmq/store/__init__.py,sha256=HH30KAHqo1kPr4qbjo6oKxdvgpF_XrqPgk8hSCkPbAY,3152
 ftmq/store/aleph.py,sha256=vENWpMgV1rViMwcef9wAwOGyrciCnSPRN-dSI103478,3977
-ftmq/store/base.py,sha256=
-ftmq/store/fragments/__init__.py,sha256=
-ftmq/store/fragments/dataset.py,sha256=
-ftmq/store/fragments/loader.py,sha256=
+ftmq/store/base.py,sha256=1IgX4haeNUA9NTBCi-hBFVe0u44ra1nGdHAjSTOUEAs,4762
+ftmq/store/fragments/__init__.py,sha256=jHXHejqXe6sBNllAt-BuU24Ou8m5evmsD1UPac5J2GE,750
+ftmq/store/fragments/dataset.py,sha256=P1ljcpoRpl_IIApA2A3vizpB2goRxxue_NNoeI1JQN8,13032
+ftmq/store/fragments/loader.py,sha256=iVh8F22IApe9MRY_Z2fOLvT80fCYstFyxu410l4pPQY,4066
 ftmq/store/fragments/settings.py,sha256=4c-BW-blVM9gC_IGPch03eExbZYFZ3V5h9yTfhcHvOI,303
-ftmq/store/fragments/store.py,sha256=
+ftmq/store/fragments/store.py,sha256=LiSfg95LjEmyq2IUpX4CMtp2tE37SEBLtAw0EWufImM,2534
 ftmq/store/fragments/utils.py,sha256=SDoLPFF5O_oJLIPrCEab5iGn-pl6y0AhYZDYIPYxYkk,1098
-ftmq/store/lake.py,sha256=
+ftmq/store/lake.py,sha256=snbEZXTuR9Oy3o7p6XA61IVpsd4-70a6PgoL63PBrRg,11683
 ftmq/store/level.py,sha256=ZGx-mMtJZJNWkpvbe7ajTREnW5MPcnw0ct3nSFLVF0I,781
 ftmq/store/memory.py,sha256=lZ_pDzrBWNljbNb1MXJeCoO7TnAdqEfG4kfLDOU5rME,551
 ftmq/store/redis.py,sha256=d0hkGF_BezdIfCMUshXWoQwvGmqT8JFblUMcCxzwkDA,433
 ftmq/store/sql.py,sha256=6h3-gDaTAlD-IkiOONcX-JbaAO9-QfSsMjjMPupclcQ,5216
 ftmq/types.py,sha256=HgF8eT3ynKnDUxBYFtoDytS-uN_CS7Yr3DHIX2r4tnk,774
-ftmq/util.py,sha256=
-ftmq-4.1.
-ftmq-4.1.
-ftmq-4.1.
-ftmq-4.1.
-ftmq-4.1.
-ftmq-4.1.
+ftmq/util.py,sha256=CmbZXYAbsKbAjoWn8WxR1Sz4VPXc2gj9CkHwaTqpBG0,15691
+ftmq-4.3.1.dist-info/METADATA,sha256=4sW694l-QjyhMtxl7CN1M2vyFwPDckDg4kpso9aD0cs,5324
+ftmq-4.3.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ftmq-4.3.1.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
+ftmq-4.3.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ftmq-4.3.1.dist-info/licenses/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
+ftmq-4.3.1.dist-info/RECORD,,
{ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info}/entry_points.txt
File without changes

{ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info/licenses}/LICENSE
File without changes

{ftmq-4.1.2.dist-info → ftmq-4.3.1.dist-info/licenses}/NOTICE
File without changes