ftmq 4.1.1__tar.gz → 4.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {ftmq-4.1.1 → ftmq-4.3.2}/PKG-INFO +12 -9
  2. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/__init__.py +1 -1
  3. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/cli.py +81 -0
  4. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/dataset.py +1 -0
  5. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/__init__.py +1 -1
  6. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/base.py +1 -1
  7. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/__init__.py +1 -1
  8. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/dataset.py +143 -15
  9. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/loader.py +13 -5
  10. ftmq-4.3.2/ftmq/store/fragments/store.py +71 -0
  11. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/lake.py +1 -1
  12. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/util.py +80 -12
  13. {ftmq-4.1.1 → ftmq-4.3.2}/pyproject.toml +14 -13
  14. ftmq-4.1.1/ftmq/store/fragments/store.py +0 -43
  15. {ftmq-4.1.1 → ftmq-4.3.2}/LICENSE +0 -0
  16. {ftmq-4.1.1 → ftmq-4.3.2}/NOTICE +0 -0
  17. {ftmq-4.1.1 → ftmq-4.3.2}/README.md +0 -0
  18. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/aggregate.py +0 -0
  19. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/aggregations.py +0 -0
  20. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/enums.py +0 -0
  21. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/filters.py +0 -0
  22. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/io.py +0 -0
  23. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/logging.py +0 -0
  24. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/__init__.py +0 -0
  25. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/entity.py +0 -0
  26. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/mixins.py +0 -0
  27. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/stats.py +0 -0
  28. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/query.py +0 -0
  29. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/similar.py +0 -0
  30. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/sql.py +0 -0
  31. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/aleph.py +0 -0
  32. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/settings.py +0 -0
  33. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/utils.py +0 -0
  34. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/level.py +0 -0
  35. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/memory.py +0 -0
  36. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/redis.py +0 -0
  37. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/sql.py +0 -0
  38. {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/types.py +0 -0
{ftmq-4.1.1 → ftmq-4.3.2}/PKG-INFO

@@ -1,8 +1,10 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: ftmq
- Version: 4.1.1
+ Version: 4.3.2
  Summary: followthemoney query dsl and io helpers
  License: AGPLv3+
+ License-File: LICENSE
+ License-File: NOTICE
  Author: Simon Wörpel
  Author-email: simon.woerpel@pm.me
  Requires-Python: >=3.11,<4
@@ -19,24 +21,25 @@ Provides-Extra: postgres
  Provides-Extra: redis
  Provides-Extra: sql
  Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
- Requires-Dist: anystore (>=0.3.9,<0.4.0)
+ Requires-Dist: anystore (>=0.4.0,<0.5.0)
  Requires-Dist: click (>=8.2.1,<9.0.0)
  Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
- Requires-Dist: deltalake (>=1.1.2,<2.0.0) ; extra == "lake"
- Requires-Dist: duckdb (>=1.3.2,<2.0.0) ; extra == "lake"
+ Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
+ Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
  Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
- Requires-Dist: followthemoney (>=4.1.1,<5.0.0)
+ Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
  Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
- Requires-Dist: nomenklatura (>=4.1.0,<5.0.0)
+ Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
  Requires-Dist: orjson (>=3.10.18,<4.0.0)
- Requires-Dist: pandas (>=2.3.1,<3.0.0) ; extra == "lake"
+ Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
  Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "postgres"
+ Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
  Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
  Requires-Dist: pycountry (>=24.6.1,<25.0.0)
  Requires-Dist: pydantic (>=2.11.3,<3.0.0)
  Requires-Dist: pyicu (>=2.15.2,<3.0.0)
  Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
+ Requires-Dist: rigour (>=1.4.1,<2.0.0)
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
  Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/__init__.py

@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
  from ftmq.query import Query
  from ftmq.util import make_entity

- __version__ = "4.1.1"
+ __version__ = "4.3.2"
  __all__ = [
      "smart_read_proxies",
      "smart_write_proxies",
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/cli.py

@@ -1,3 +1,5 @@
+ from datetime import datetime
+
  import click
  from anystore.io import smart_write, smart_write_json, smart_write_model
  from click_default_group import DefaultGroup
@@ -11,6 +13,9 @@ from ftmq.model.dataset import Catalog, Dataset
  from ftmq.model.stats import Collector
  from ftmq.query import Query
  from ftmq.store import get_store
+ from ftmq.store.fragments import get_fragments
+ from ftmq.store.fragments import get_store as get_fragments_store
+ from ftmq.store.fragments.settings import Settings as FragmentsSettings
  from ftmq.util import apply_dataset, parse_unknown_filters

  log = get_logger(__name__)
@@ -311,6 +316,82 @@ def store_iterate(
      smart_write_proxies(output_uri, store.iterate())


+ @cli.group()
+ def fragments():
+     pass
+
+
+ fragments_settings = FragmentsSettings()
+
+
+ @fragments.command("list-datasets")
+ @click.option(
+     "-i",
+     "--input-uri",
+     default=fragments_settings.database_uri,
+     show_default=True,
+     help="input file or uri",
+ )
+ @click.option(
+     "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+ )
+ def fragments_list_datasets(
+     input_uri: str = fragments_settings.database_uri,
+     output_uri: str = "-",
+ ):
+     """
+     List datasets within a fragments store
+     """
+     store = get_fragments_store(input_uri)
+     datasets = [ds.name for ds in store.all()]
+     smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+ @fragments.command("iterate")
+ @click.option(
+     "-i",
+     "--input-uri",
+     default=fragments_settings.database_uri,
+     show_default=True,
+     help="fragments store input uri",
+ )
+ @click.option(
+     "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+ )
+ @click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+ @click.option("-s", "--schema", default=None, help="Filter by schema")
+ @click.option(
+     "--since",
+     default=None,
+     help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
+ )
+ @click.option(
+     "--until",
+     default=None,
+     help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
+ )
+ def fragments_iterate(
+     input_uri: str = fragments_settings.database_uri,
+     output_uri: str = "-",
+     dataset: str = None,
+     schema: str | None = None,
+     since: str | None = None,
+     until: str | None = None,
+ ):
+     """
+     Iterate all entities from a fragments dataset
+     """
+     fragments = get_fragments(dataset, database_uri=input_uri)
+
+     # Parse timestamp strings to datetime objects
+     since_dt = datetime.fromisoformat(since) if since else None
+     until_dt = datetime.fromisoformat(until) if until else None
+
+     smart_write_proxies(
+         output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
+     )
+
+
  @cli.command("aggregate")
  @click.option(
      "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
@@ -16,6 +16,7 @@ ContentType = Literal["documents", "structured", "mixed"]
16
16
 
17
17
 
18
18
  class Dataset(BaseModel, _DatasetModel):
19
+ prefix: str | None = None
19
20
  maintainer: DataPublisher | None = None
20
21
  stats: DatasetStats = DatasetStats()
21
22
  git_repo: AnyUrl | None = None
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/__init__.py

@@ -1,7 +1,7 @@
- from functools import cache
  from pathlib import Path
  from urllib.parse import urlparse

+ from anystore.functools import weakref_cache as cache
  from anystore.types import Uri
  from followthemoney.dataset.dataset import Dataset
  from nomenklatura import Resolver, settings
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/base.py

@@ -1,7 +1,7 @@
- from functools import cache
  from typing import Generator, Iterable
  from urllib.parse import urlparse

+ from anystore.functools import weakref_cache as cache
  from followthemoney import DefaultDataset
  from followthemoney.dataset.dataset import Dataset
  from nomenklatura import store as nk
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/__init__.py

@@ -1,4 +1,4 @@
- from functools import cache
+ from anystore.functools import weakref_cache as cache

  from ftmq.store.fragments.dataset import Fragments
  from ftmq.store.fragments.settings import Settings
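Several modules in this release swap `functools.cache` for anystore's `weakref_cache`. A plain `functools.cache` keeps every cached store object alive for the lifetime of the process; a weakref-based cache lets cached objects be garbage-collected once no caller holds them. A rough standalone sketch of the concept (not anystore's actual implementation; it assumes the cached values are weak-referenceable class instances, as the store objects here are):

```python
import weakref
from functools import wraps


def weakref_cache(func):
    """Memoize by positional args, holding results only through weak
    references so unused cached objects can be garbage-collected."""
    refs: dict = {}

    @wraps(func)
    def wrapper(*args):
        ref = refs.get(args)
        value = ref() if ref is not None else None  # None once collected
        if value is None:
            value = func(*args)
            refs[args] = weakref.ref(value)
        return value

    return wrapper
```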
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/dataset.py

@@ -1,4 +1,5 @@
  import logging
+ from contextlib import contextmanager
  from datetime import datetime
  from typing import Generator, Iterable, TypeAlias

@@ -9,6 +10,7 @@ from normality import slugify
  from sqlalchemy import (
      JSON,
      Column,
+     Connection,
      DateTime,
      String,
      Table,
@@ -28,16 +30,42 @@ from ftmq.util import make_dataset
  log = logging.getLogger(__name__)
  UNDEFINED = (OperationalError,)
  try:
-     from psycopg2.errors import UndefinedTable
+     from psycopg.errors import UndefinedTable

      UNDEFINED = (UndefinedTable, *UNDEFINED)
  except ImportError:
-     pass
+     try:
+         from psycopg2.errors import UndefinedTable
+
+         UNDEFINED = (UndefinedTable, *UNDEFINED)
+     except ImportError:
+         pass


  EntityFragments: TypeAlias = Generator[EntityProxy, None, None]


+ @contextmanager
+ def disable_timeout(conn: Connection, store):
+     # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+     # don't want to get cancelled if a idle_in_transaction_timeout is configured
+     # on the server
+     if store.is_postgres:
+         raw_conn = conn.connection.driver_connection
+         with raw_conn.cursor() as cursor:
+             cursor.execute("SET idle_in_transaction_session_timeout = 0")
+     try:
+         yield conn
+     finally:
+         if store.is_postgres:
+             try:
+                 raw_conn = conn.connection.driver_connection
+                 with raw_conn.cursor() as cursor:
+                     cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+             except Exception:
+                 pass  # Connection might be closed


  class Fragments(object):
      def __init__(self, store, name, origin=NULL_ORIGIN):
          self.store = store
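`disable_timeout` reaches down to the raw driver connection because `idle_in_transaction_session_timeout` is a server-side PostgreSQL setting that has to be changed per session, outside SQLAlchemy's statement API. Stripped of the SQLAlchemy plumbing, the same idea with a bare psycopg connection (hypothetical DSN; requires a reachable PostgreSQL server):

```python
import psycopg

# hypothetical DSN, for illustration only
with psycopg.connect("postgresql://localhost/ftm") as conn:
    with conn.cursor() as cur:
        # lift the server-side idle-in-transaction limit for this session
        cur.execute("SET idle_in_transaction_session_timeout = 0")
        # ... long-running streaming work would happen here ...
        # restore the server's configured default afterwards
        cur.execute("SET idle_in_transaction_session_timeout = DEFAULT")
```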
@@ -104,7 +132,9 @@ class Fragments(object):
      def bulk(self, size=1000):
          return BulkLoader(self, size)

-     def fragments(self, entity_ids=None, fragment=None):
+     def fragments(
+         self, entity_ids=None, fragment=None, schema=None, since=None, until=None
+     ):
          stmt = self.table.select()
          entity_ids = ensure_list(entity_ids)
          if len(entity_ids) == 1:
@@ -113,25 +143,42 @@ class Fragments(object):
              stmt = stmt.where(self.table.c.id.in_(entity_ids))
          if fragment is not None:
              stmt = stmt.where(self.table.c.fragment == fragment)
+         if schema is not None:
+             if self.store.is_postgres:
+                 stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
+             else:
+                 # SQLite JSON support - use json_extract function
+                 stmt = stmt.where(
+                     func.json_extract(self.table.c.entity, "$.schema") == schema
+                 )
+         if since is not None:
+             stmt = stmt.where(self.table.c.timestamp >= since)
+         if until is not None:
+             stmt = stmt.where(self.table.c.timestamp <= until)
          stmt = stmt.order_by(self.table.c.id)
          # stmt = stmt.order_by(self.table.c.origin)
          # stmt = stmt.order_by(self.table.c.fragment)
          conn = self.store.engine.connect()
          try:
-             conn = conn.execution_options(stream_results=True)
-             for ent in conn.execute(stmt):
-                 data = {"id": ent.id, "datasets": [self.name], **ent.entity}
-                 if ent.origin != NULL_ORIGIN:
-                     data["origin"] = ent.origin
-                 yield data
+             with disable_timeout(conn, self.store) as conn:
+                 conn = conn.execution_options(stream_results=True)
+                 for ent in conn.execute(stmt):
+                     data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                     if ent.origin != NULL_ORIGIN:
+                         data["origin"] = ent.origin
+                     yield data
          except Exception:
              self.reset()
              raise
          finally:
              conn.close()

-     def partials(self, entity_id=None, skip_errors=False) -> EntityFragments:
-         for fragment in self.fragments(entity_ids=entity_id):
+     def partials(
+         self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+     ) -> EntityFragments:
+         for fragment in self.fragments(
+             entity_ids=entity_id, schema=schema, since=since, until=until
+         ):
              try:
                  yield EntityProxy.from_dict(fragment, cleaned=True)
              except Exception:
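The new schema filter needs dialect-specific JSON access: on PostgreSQL, `entity["schema"].astext` compiles to the `->>` operator, while SQLite goes through `json_extract`. A self-contained illustration of the SQLite branch against an in-memory database:

```python
from sqlalchemy import (
    JSON,
    Column,
    MetaData,
    String,
    Table,
    create_engine,
    func,
    select,
)

engine = create_engine("sqlite://")  # in-memory database
meta = MetaData()
table = Table("t", meta, Column("id", String), Column("entity", JSON))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(
        table.insert(),
        [
            {"id": "a", "entity": {"schema": "Person"}},
            {"id": "b", "entity": {"schema": "Company"}},
        ],
    )
    stmt = select(table.c.id).where(
        func.json_extract(table.c.entity, "$.schema") == "Person"
    )
    print(conn.execute(stmt).scalars().all())  # ['a']
```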
@@ -140,18 +187,32 @@ class Fragments(object):
                      continue
              raise

-     def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
+     def iterate(
+         self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+     ) -> EntityFragments:
+         if entity_id is None:
+             log.info("Using batched iteration for complete dataset.")
+             yield from self.iterate_batched(
+                 skip_errors=skip_errors, schema=schema, since=since, until=until
+             )
+             return
          entity = None
          invalid = None
          fragments = 1
-         for partial in self.partials(entity_id=entity_id, skip_errors=skip_errors):
+         for partial in self.partials(
+             entity_id=entity_id,
+             skip_errors=skip_errors,
+             schema=schema,
+             since=since,
+             until=until,
+         ):
              if partial.id == invalid:
                  continue
              if entity is not None:
                  if entity.id == partial.id:
                      fragments += 1
                      if fragments % 10000 == 0:
-                         log.debug(
+                         log.warning(
                              "[%s:%s] aggregated %d fragments...",
                              entity.schema.name,
                              entity.id,
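Within this loop, consecutive partials that share an id are merged into a single entity before it is yielded. Condensed to its core (a sketch that assumes id-sorted input, which the id-ordered query above guarantees, and that ignores the error handling):

```python
from itertools import groupby


def aggregate(partials):
    """Merge id-sorted partial EntityProxy objects into one entity per
    id -- a condensed sketch of what Fragments.iterate() does."""
    for _, group in groupby(partials, key=lambda p: p.id):
        entity = next(group)
        for partial in group:
            entity = entity.merge(partial)
        yield entity
```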
@@ -176,11 +237,76 @@ class Fragments(object):
          if entity is not None:
              yield entity

+     def iterate_batched(
+         self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
+     ) -> EntityFragments:
+         """
+         For large datasets an overall sort is not feasible, so we iterate in
+         sorted batched IDs.
+         """
+         for entity_ids in self.get_sorted_id_batches(
+             batch_size, schema=schema, since=since, until=until
+         ):
+             yield from self.iterate(
+                 entity_id=entity_ids,
+                 skip_errors=skip_errors,
+                 schema=schema,
+                 since=since,
+                 until=until,
+             )
+
+     def get_sorted_id_batches(
+         self, batch_size=10_000, schema=None, since=None, until=None
+     ) -> Generator[list[str], None, None]:
+         """
+         Get sorted ID batches to speed up iteration and useful to parallelize
+         processing of iterator Entities
+         """
+         last_id = None
+         with self.store.engine.connect() as conn:
+             while True:
+                 stmt = select(self.table.c.id).distinct()
+                 if last_id is not None:
+                     stmt = stmt.where(self.table.c.id > last_id)
+                 if schema is not None:
+                     if self.store.is_postgres:
+                         stmt = stmt.where(
+                             self.table.c.entity["schema"].astext == schema
+                         )
+                     else:
+                         # SQLite JSON support - use json_extract function
+                         stmt = stmt.where(
+                             func.json_extract(self.table.c.entity, "$.schema") == schema
+                         )
+                 if since is not None:
+                     stmt = stmt.where(self.table.c.timestamp >= since)
+                 if until is not None:
+                     stmt = stmt.where(self.table.c.timestamp <= until)
+                 stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                 try:
+                     res = conn.execute(stmt)
+                     entity_ids = [r.id for r in res.fetchall()]
+                     if not entity_ids:
+                         return
+                     yield entity_ids
+                     last_id = entity_ids[-1]
+                 except Exception:
+                     self.reset()
+                     raise
+
+     def get_sorted_ids(
+         self, batch_size=10_000, schema=None, since=None, until=None
+     ) -> Generator[str, None, None]:
+         """Get sorted IDs, optionally filtered by schema"""
+         for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
+             yield from batch
+
      def statements(
          self,
          entity_ids: Iterable[str] | None = None,
          origin: str | None = None,
          since: datetime | None = None,
+         until: datetime | None = None,
      ) -> Statements:
          """Iterate unsorted statements with its fragment origins"""
          stmt = self.table.select()
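`get_sorted_id_batches` is keyset (seek) pagination: instead of `OFFSET`, which forces the database to skip all preceding rows on every page, each query resumes strictly after the last id already seen, so every page is served from the index. The core pattern, stripped of the schema and timestamp filters:

```python
from sqlalchemy import select


def keyset_ids(conn, table, batch_size=10_000):
    """Yield batches of distinct ids in sorted order, resuming each
    query after the last id of the previous batch (seek method)."""
    last_id = None
    while True:
        stmt = select(table.c.id).distinct()
        if last_id is not None:
            stmt = stmt.where(table.c.id > last_id)
        stmt = stmt.order_by(table.c.id).limit(batch_size)
        ids = conn.execute(stmt).scalars().all()
        if not ids:
            return
        yield ids
        last_id = ids[-1]
```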
@@ -192,7 +318,9 @@ class Fragments(object):
          if origin is not None:
              stmt = stmt.where(self.table.c.origin == origin)
          if since is not None:
-             stmt = stmt.where(self.table.c.timestamp > since)
+             stmt = stmt.where(self.table.c.timestamp >= since)
+         if until is not None:
+             stmt = stmt.where(self.table.c.timestamp <= until)
          conn = self.store.engine.connect()
          default_dataset = make_dataset(self.name)
          try:
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/loader.py

@@ -26,11 +26,16 @@ EXCEPTIONS = (
      TimeoutError,
  )
  try:
-     from psycopg2 import DatabaseError, OperationalError
+     from psycopg import DatabaseError, OperationalError

      EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
  except ImportError:
-     pass
+     try:
+         from psycopg2 import DatabaseError, OperationalError
+
+         EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
+     except ImportError:
+         pass

  log = logging.getLogger(__name__)

@@ -50,9 +55,12 @@ class BulkLoader(object):
          else:
              entity = dict(entity)
          id_ = entity.pop("id")
-         self.buffer[(id_, origin, fragment)] = entity
-         if len(self.buffer) >= self.size:
-             self.flush()
+         if id_:
+             self.buffer[(id_, origin, fragment)] = entity
+             if len(self.buffer) >= self.size:
+                 self.flush()
+         else:
+             log.warning("Entity has no ID!")

      def _store_values(self, conn, values):
          table = self.dataset.table
ftmq-4.3.2/ftmq/store/fragments/store.py (new file)

@@ -0,0 +1,71 @@
+ from sqlalchemy import MetaData, create_engine
+ from sqlalchemy import inspect as sqlalchemy_inspect
+
+ from ftmq.store.fragments.dataset import Fragments
+ from ftmq.store.fragments.utils import NULL_ORIGIN
+
+
+ class Store(object):
+     """A database containing multiple tables that represent
+     FtM-store datasets."""
+
+     PREFIX = "ftm"
+
+     def _adjust_psycopg3_uri(self, database_uri: str) -> str:
+         """Adjust PostgreSQL URI to use psycopg3 dialect if psycopg is available."""
+         if database_uri.startswith(("postgresql://", "postgres://")):
+             try:
+                 import psycopg  # noqa: F401
+
+                 # Use psycopg3 dialect for better performance and compatibility
+                 if database_uri.startswith("postgresql://"):
+                     return database_uri.replace(
+                         "postgresql://", "postgresql+psycopg://", 1
+                     )
+                 elif database_uri.startswith("postgres://"):
+                     return database_uri.replace(
+                         "postgres://", "postgresql+psycopg://", 1
+                     )
+             except ImportError:
+                 # Fall back to psycopg2 if psycopg3 is not available
+                 pass
+         return database_uri
+
+     def __init__(
+         self,
+         database_uri: str,
+         **config,
+     ):
+         self.database_uri = self._adjust_psycopg3_uri(database_uri)
+
+         # Configure connection pooling for psycopg3
+         config.setdefault("pool_size", 1)
+         if self.database_uri.startswith("postgresql+psycopg://"):
+             config.setdefault("max_overflow", 5)
+             config.setdefault("pool_timeout", 60)
+             config.setdefault("pool_recycle", 3600)
+             config.setdefault("pool_pre_ping", True)
+
+         self.engine = create_engine(self.database_uri, future=True, **config)
+         self.is_postgres = self.engine.dialect.name == "postgresql"
+         self.meta = MetaData()
+
+     def get(self, name, origin=NULL_ORIGIN):
+         return Fragments(self, name, origin=origin)
+
+     def all(self, origin=NULL_ORIGIN):
+         prefix = f"{self.PREFIX}_"
+         inspect = sqlalchemy_inspect(self.engine)
+         for table in inspect.get_table_names():
+             if table.startswith(prefix):
+                 name = table[len(prefix) :]
+                 yield Fragments(self, name, origin=origin)
+
+     def close(self):
+         self.engine.dispose()
+
+     def __len__(self):
+         return len(list(self.all()))
+
+     def __repr__(self):
+         return "<Store(%r)>" % self.engine
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/lake.py

@@ -18,7 +18,6 @@ Layout:
  ```
  """

- from functools import cache
  from pathlib import Path
  from typing import Any, Generator, Iterable
  from urllib.parse import urlparse
@@ -26,6 +25,7 @@ from urllib.parse import urlparse
  import duckdb
  import numpy as np
  import pandas as pd
+ from anystore.functools import weakref_cache as cache
  from anystore.lock import Lock
  from anystore.logging import get_logger
  from anystore.store.fs import Store as FSStore
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/util.py

@@ -1,10 +1,11 @@
- from functools import cache, lru_cache
- from typing import Any, Generator, Iterable, Type
+ from functools import lru_cache
+ from typing import Any, Generator, Type

  import pycountry
- from anystore.types import SDict
+ from anystore.functools import weakref_cache as cache
+ from anystore.types import SDict, StrGenerator
  from banal import ensure_list, is_listish
- from followthemoney import E
+ from followthemoney import E, model
  from followthemoney.compare import _normalize_names
  from followthemoney.dataset import Dataset
  from followthemoney.entity import ValueEntity
@@ -12,7 +13,10 @@ from followthemoney.proxy import EntityProxy
  from followthemoney.schema import Schema
  from followthemoney.types import registry
  from followthemoney.util import make_entity_id, sanitize_text
- from normality import collapse_spaces, slugify
+ from normality import latinize_text, slugify, squash_spaces
+ from rigour.names import Name, Symbol, tag_org_name, tag_person_name
+ from rigour.names.tokenize import normalize_name
+ from rigour.text.scripts import can_latinize

  from ftmq.enums import Comparators
  from ftmq.types import Entity
@@ -317,7 +321,7 @@ def clean_string(value: Any) -> str | None:
      value = sanitize_text(value)
      if value is None:
          return
-     return collapse_spaces(value)
+     return squash_spaces(value)


  def clean_name(value: Any) -> str | None:
@@ -377,18 +381,21 @@ def make_fingerprint(value: Any) -> str | None:


  def entity_fingerprints(entity: EntityProxy) -> set[str]:
-     """Get the set of entity name fingerprints"""
-     # FIXME private import
-     return set(_normalize_names(entity.schema, entity.names))
+     """Get the set of entity name fingerprints, latinized if the alphabet allows
+     it and with org / person tags removed depending on entity schema"""
+     return make_fingerprints(*entity.names, schemata={entity.schema})


- def make_fingerprints(schemata: set[Schema], names: Iterable[str]) -> set[str]:
-     """Mimic `fingerprints.generate`"""
+ def make_fingerprints(*names: str, schemata: set[Schema] | None = None) -> set[str]:
+     """Get the set of name fingerprints, latinized if the alphabet allows
+     it and with org / person tags removed depending on given schemata"""
      # FIXME private import
+     schemata = schemata or {model["LegalEntity"]}
      fps: set[str] = set()
      for schema in schemata:
          fps.update(set(_normalize_names(schema, names)))
-     return fps
+     # add latinized if appropriate
+     return {latinize_text(fp) if can_latinize(fp) else fp for fp in fps}


  def make_string_id(*values: Any) -> str | None:
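Fingerprints are still produced by followthemoney's `_normalize_names`, but each one is now additionally latinized when rigour's `can_latinize` judges the script convertible. A hedged usage sketch (the exact output strings depend on the normalization rules of the installed followthemoney and rigour versions):

```python
from ftmq.util import make_fingerprints

# no schemata given, so the LegalEntity schema is assumed by default
print(make_fingerprints("Siemens Aktiengesellschaft"))

# a Cyrillic name comes back latinized, since can_latinize() permits it
print(make_fingerprints("Газпром"))
```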
@@ -476,3 +483,64 @@ def must_str(value: Any) -> str:
      if not value:
          raise ValueError(f"Value invalid: `{value}`")
      return value
+
+
+ SELECT_SYMBOLS = "__symbols__"
+ SELECT_ANNOTATED = "__annotated__"
+
+
+ def get_name_symbols(schema: Schema, *names: str) -> set[Symbol]:
+     """Get the rigour names symbols for the given schema and list of names"""
+     symbols: set[Symbol] = set()
+     if schema.is_a("Person"):
+         taggers = [tag_person_name]
+     elif schema.is_a("Organization"):
+         taggers = [tag_org_name]
+     elif schema.is_a("LegalEntity"):
+         taggers = [tag_org_name, tag_person_name]
+     else:
+         return symbols
+     for name in names:
+         n = Name(name)
+         for tagger in taggers:
+             for symbol in tagger(n, normalize_name).symbols:
+                 symbols.add(symbol)
+     return symbols
+
+
+ def get_symbols(entity: EntityProxy) -> set[Symbol]:
+     """Get the rigour names symbols for the given entity"""
+     if not entity.schema.is_a("LegalEntity"):
+         return set()
+     names = entity.get_type_values(registry.name, matchable=True)
+     return get_name_symbols(entity.schema, *names)
+
+
+ def inline_symbols(entity: EntityProxy) -> None:
+     """Get the rigour names symbols for the given entity and write them to `indexText`"""
+     # clean up old symbols from indexText:
+     for text in entity.pop("indexText"):
+         if not text.startswith(SELECT_SYMBOLS):
+             entity.add("indexText", text)
+     symbols = get_symbols(entity)
+     entity.add("indexText", f"{SELECT_SYMBOLS} {','.join(map(str, symbols))}")
+
+
+ def select_data(e: EntityProxy, prefix: str) -> StrGenerator:
+     """Select arbitrary stored data in `indexText` identified by given prefix"""
+     for text in e.get("indexText", quiet=True):
+         if text.startswith(prefix):
+             yield text.replace(prefix, "").strip()
+
+
+ def select_symbols(e: EntityProxy) -> set[str]:
+     """Select stored symbols in `indexText`"""
+     symbols: set[str] = set()
+     for data in select_data(e, SELECT_SYMBOLS):
+         symbols.update(data.split(","))
+     return symbols
+
+
+ def select_annotations(e: EntityProxy) -> set[str]:
+     """Select stored annotations in `indexText`"""
+     return {s for s in select_data(e, SELECT_ANNOTATED)}
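A hedged sketch of the tagging entry point; which symbols actually come back depends on rigour's bundled name data:

```python
from followthemoney import model
from ftmq.util import get_name_symbols

# hypothetical name; the org tagger may pick up e.g. the "GmbH" legal form
symbols = get_name_symbols(model["Organization"], "ACME Holding GmbH")
print(symbols)

# non-LegalEntity schemata yield no symbols, per the guard clauses above
assert get_name_symbols(model["Airplane"], "Boeing 747") == set()
```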
{ftmq-4.1.1 → ftmq-4.3.2}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "ftmq"
- version = "4.1.1"
+ version = "4.3.2"
  description = "followthemoney query dsl and io helpers"
  authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
  license = "AGPLv3+"
@@ -15,9 +15,10 @@ classifiers = [
  ]
  requires-python = ">=3.11,<4"
  dependencies = [
-     "anystore (>=0.3.9,<0.4.0)",
-     "followthemoney (>=4.1.1,<5.0.0)",
-     "nomenklatura (>=4.1.0,<5.0.0)",
+     "anystore (>=0.4.0,<0.5.0)",
+     "followthemoney (>=4.3.2,<5.0.0)",
+     "nomenklatura (>=4.1.10,<5.0.0)",
+     "rigour (>=1.4.1,<2.0.0)",
      "click (>=8.2.1,<9.0.0)",
      "click-default-group (>=1.2.4,<2.0.0)",
      "orjson (>=3.10.18,<4.0.0)",
@@ -29,12 +30,12 @@ dependencies = [
  [project.optional-dependencies]
  level = ["plyvel (>=1.5.1,<2.0.0)"]
  sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
- postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg2 (>=2.9.10,<3.0.0)"]
+ postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
  redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
  lake = [
-     "duckdb (>=1.3.2,<2.0.0)",
-     "pandas (>=2.3.1,<3.0.0)",
-     "deltalake (>=1.1.2,<2.0.0)",
+     "duckdb (>=1.4.1,<2.0.0)",
+     "pandas (>=2.3.3,<3.0.0)",
+     "deltalake (>=1.2.1,<2.0.0)",
      "pyarrow (>=21.0.0,<22.0.0)",
  ]
  aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
@@ -50,19 +51,19 @@ Issues = "https://github.com/dataresearchcenter/ftmq/issues"
50
51
 
51
52
  [tool.poetry.group.dev.dependencies]
52
53
  pytest = ">=7.4.3,<9.0.0"
53
- pytest-cov = ">=4.1,<7.0"
54
+ pytest-cov = ">=4.1,<8.0"
54
55
  pytest-env = "^1.1.1"
55
56
  black = ">=23.11,<26.0"
56
- isort = "^6.0.1"
57
+ isort = "^7.0.0"
57
58
  mypy = "^1.17.1"
58
59
  pre-commit = "^4.0.1"
59
60
  flake8 = ">=6.1,<8.0"
60
61
  ipdb = "^0.13.13"
61
62
  bump2version = "^1.0.1"
62
63
  mkdocs = "^1.6.1"
63
- mkdocstrings-python = "^1.16.10"
64
- mkdocs-autorefs = "^1.4.1"
65
- mkdocs-material = "^9.6.16"
64
+ mkdocs-autorefs = "^1.4.3"
65
+ mkdocstrings-python = "^1.18.2"
66
+ mkdocs-material = "^9.6.18"
66
67
  mkdocs-click = "^0.9.0"
67
68
 
68
69
  [build-system]
@@ -1,43 +0,0 @@
1
- from sqlalchemy import MetaData, create_engine
2
- from sqlalchemy import inspect as sqlalchemy_inspect
3
-
4
- from ftmq.store.fragments.dataset import Fragments
5
- from ftmq.store.fragments.utils import NULL_ORIGIN
6
-
7
-
8
- class Store(object):
9
- """A database containing multiple tables that represent
10
- FtM-store datasets."""
11
-
12
- PREFIX = "ftm"
13
-
14
- def __init__(
15
- self,
16
- database_uri: str,
17
- **config,
18
- ):
19
- self.database_uri = database_uri
20
- # config.setdefault('pool_size', 1)
21
- self.engine = create_engine(database_uri, future=True, **config)
22
- self.is_postgres = self.engine.dialect.name == "postgresql"
23
- self.meta = MetaData()
24
-
25
- def get(self, name, origin=NULL_ORIGIN):
26
- return Fragments(self, name, origin=origin)
27
-
28
- def all(self, origin=NULL_ORIGIN):
29
- prefix = f"{self.PREFIX}_"
30
- inspect = sqlalchemy_inspect(self.engine)
31
- for table in inspect.get_table_names():
32
- if table.startswith(prefix):
33
- name = table[len(prefix) :]
34
- yield Fragments(self, name, origin=origin)
35
-
36
- def close(self):
37
- self.engine.dispose()
38
-
39
- def __len__(self):
40
- return len(list(self.all()))
41
-
42
- def __repr__(self):
43
- return "<Store(%r)>" % self.engine