ftmq 4.3.1__py3-none-any.whl → 4.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ftmq/__init__.py CHANGED
@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
2
2
  from ftmq.query import Query
3
3
  from ftmq.util import make_entity
4
4
 
5
- __version__ = "4.3.1"
5
+ __version__ = "4.3.4"
6
6
  __all__ = [
7
7
  "smart_read_proxies",
8
8
  "smart_write_proxies",
ftmq/model/dataset.py CHANGED
@@ -16,6 +16,7 @@ ContentType = Literal["documents", "structured", "mixed"]
16
16
 
17
17
 
18
18
  class Dataset(BaseModel, _DatasetModel):
19
+ prefix: str | None = None
19
20
  maintainer: DataPublisher | None = None
20
21
  stats: DatasetStats = DatasetStats()
21
22
  git_repo: AnyUrl | None = None
ftmq/store/lake.py CHANGED
@@ -19,7 +19,7 @@ Layout:
19
19
  """
20
20
 
21
21
  from pathlib import Path
22
- from typing import Any, Generator, Iterable
22
+ from typing import Any, Generator
23
23
  from urllib.parse import urlparse
24
24
 
25
25
  import duckdb
@@ -62,6 +62,9 @@ Z_ORDER = ["canonical_id", "entity_id", "schema", "prop"]
62
62
  TARGET_SIZE = 50 * 10_485_760 # 500 MB
63
63
  PARTITION_BY = ["dataset", "bucket", "origin"]
64
64
  DEFAULT_ORIGIN = "default"
65
+ BUCKET_MENTION = "mention"
66
+ BUCKET_PAGE = "page"
67
+ BUCKET_PAGES = "pages"
65
68
  BUCKET_DOCUMENT = "document"
66
69
  BUCKET_INTERVAL = "interval"
67
70
  BUCKET_THING = "thing"
@@ -93,6 +96,7 @@ TABLE = table(
93
96
  column("dataset"),
94
97
  column("bucket"),
95
98
  column("origin"),
99
+ column("source"),
96
100
  column("schema"),
97
101
  column("prop"),
98
102
  column("prop_type"),
@@ -164,6 +168,12 @@ def setup_duckdb_storage() -> None:
164
168
  @cache
165
169
  def get_schema_bucket(schema_name: str) -> str:
166
170
  s = model[schema_name]
171
+ if s.is_a("Mention"):
172
+ return BUCKET_MENTION
173
+ if s.is_a("Page"):
174
+ return BUCKET_PAGE
175
+ if s.is_a("Pages"):
176
+ return BUCKET_PAGES
167
177
  if s.is_a("Document"):
168
178
  return BUCKET_DOCUMENT
169
179
  if s.is_a("Interval"):
@@ -171,19 +181,13 @@ def get_schema_bucket(schema_name: str) -> str:
171
181
  return BUCKET_THING
172
182
 
173
183
 
174
- def pack_statement(stmt: Statement) -> SDict:
184
+ def pack_statement(stmt: Statement, source: str | None = None) -> SDict:
175
185
  data = stmt.to_db_row()
176
186
  data["bucket"] = get_schema_bucket(data["schema"])
187
+ data["source"] = source
177
188
  return data
178
189
 
179
190
 
180
- def pack_statements(statements: Iterable[Statement]) -> pd.DataFrame:
181
- df = pd.DataFrame(map(pack_statement, statements))
182
- df = df.drop_duplicates() # .sort_values(Z_ORDER)
183
- df = df.fillna(np.nan)
184
- return df
185
-
186
-
187
191
  def compile_query(q: Select) -> str:
188
192
  table = nks.STATEMENT_TABLE
189
193
  sql = str(q.compile(compile_kwargs={"literal_binds": True}))
@@ -243,9 +247,6 @@ class LakeStore(SQLStore):
243
247
  self._partition_by = kwargs.pop("partition_by", PARTITION_BY)
244
248
  self._lock: Lock = kwargs.pop("lock", Lock(self._backend))
245
249
  self._enforce_dataset = kwargs.pop("enforce_dataset", False)
246
- assert isinstance(
247
- self._backend, FSStore
248
- ), f"Invalid store backend: `{self._backend.__class__}"
249
250
  kwargs["uri"] = "sqlite:///:memory:" # fake it till you make it
250
251
  get_metadata.cache_clear()
251
252
  super().__init__(*args, **kwargs)
@@ -279,8 +280,10 @@ class LakeStore(SQLStore):
279
280
  scope = scope or self.dataset
280
281
  return LakeQueryView(self, scope, external)
281
282
 
282
- def writer(self, origin: str | None = DEFAULT_ORIGIN) -> "LakeWriter":
283
- return LakeWriter(self, origin=origin or DEFAULT_ORIGIN)
283
+ def writer(
284
+ self, origin: str | None = DEFAULT_ORIGIN, source: str | None = None
285
+ ) -> "LakeWriter":
286
+ return LakeWriter(self, origin=origin or DEFAULT_ORIGIN, source=source)
284
287
 
285
288
  def get_origins(self) -> set[str]:
286
289
  q = select(self.table.c.origin).distinct()
@@ -291,32 +294,50 @@ class LakeWriter(nk.Writer):
291
294
  store: LakeStore
292
295
  BATCH_STATEMENTS = 1_000_000
293
296
 
294
- def __init__(self, store: Store, origin: str | None = DEFAULT_ORIGIN):
297
+ def __init__(
298
+ self,
299
+ store: Store,
300
+ origin: str | None = DEFAULT_ORIGIN,
301
+ source: str | None = None,
302
+ ):
295
303
  super().__init__(store)
296
- self.batch: set[Statement] = set()
304
+ self.batch: dict[Statement, str | None] = {}
297
305
  self.origin = origin or DEFAULT_ORIGIN
306
+ self.source = source
298
307
 
299
- def add_statement(self, stmt: Statement) -> None:
308
+ def add_statement(self, stmt: Statement, source: str | None = None) -> None:
300
309
  if stmt.entity_id is None:
301
310
  return
302
311
  stmt.origin = stmt.origin or self.origin
303
312
  canonical_id = self.store.linker.get_canonical(stmt.entity_id)
304
313
  stmt.canonical_id = canonical_id
305
- self.batch.add(stmt)
314
+ self.batch[stmt] = source or self.source
306
315
 
307
- def add_entity(self, entity: EntityProxy, origin: str | None = None) -> None:
316
+ def add_entity(
317
+ self,
318
+ entity: EntityProxy,
319
+ origin: str | None = None,
320
+ source: str | None = None,
321
+ ) -> None:
308
322
  e = ensure_entity(entity, StatementEntity, self.store.dataset)
309
323
  if self.store._enforce_dataset:
310
324
  e = apply_dataset(e, self.store.dataset, replace=True)
311
325
  for stmt in e.statements:
312
326
  if origin:
313
327
  stmt.origin = origin
314
- self.add_statement(stmt)
328
+ self.add_statement(stmt, source=source)
315
329
  # we check here instead of in `add_statement` as this will keep entities
316
- # together in the same parquet files`
330
+ # together in the same parquet files
317
331
  if len(self.batch) >= self.BATCH_STATEMENTS:
318
332
  self.flush()
319
333
 
334
+ def _pack_statements(self) -> pd.DataFrame:
335
+ data = [pack_statement(stmt, source) for stmt, source in self.batch.items()]
336
+ df = pd.DataFrame(data)
337
+ df = df.drop_duplicates().sort_values(Z_ORDER)
338
+ df = df.fillna(np.nan)
339
+ return df
340
+
320
341
  def flush(self) -> None:
321
342
  if self.batch:
322
343
  log.info(
@@ -326,7 +347,7 @@ class LakeWriter(nk.Writer):
326
347
  with self.store._lock:
327
348
  write_deltalake(
328
349
  str(self.store.uri),
329
- pack_statements(self.batch),
350
+ self._pack_statements(),
330
351
  partition_by=self.store._partition_by,
331
352
  mode="append",
332
353
  schema_mode="merge",
@@ -335,7 +356,7 @@ class LakeWriter(nk.Writer):
335
356
  storage_options=storage_options(),
336
357
  )
337
358
 
338
- self.batch = set()
359
+ self.batch = {}
339
360
 
340
361
  def pop(self, entity_id: str) -> list[Statement]:
341
362
  q = select(TABLE)
@@ -353,13 +374,14 @@ class LakeWriter(nk.Writer):
353
374
  """
354
375
  Optimize the storage: Z-Ordering and compacting
355
376
  """
356
- self.store.deltatable.optimize.z_order(
357
- Z_ORDER, writer_properties=WRITER, target_size=TARGET_SIZE
358
- )
359
- if vacuum:
360
- self.store.deltatable.vacuum(
361
- retention_hours=vacuum_keep_hours,
362
- enforce_retention_duration=False,
363
- dry_run=False,
364
- full=True,
377
+ with self.store._lock:
378
+ self.store.deltatable.optimize.z_order(
379
+ Z_ORDER, writer_properties=WRITER, target_size=TARGET_SIZE
365
380
  )
381
+ if vacuum:
382
+ self.store.deltatable.vacuum(
383
+ retention_hours=vacuum_keep_hours,
384
+ enforce_retention_duration=False,
385
+ dry_run=False,
386
+ full=True,
387
+ )
ftmq/util.py CHANGED
@@ -1,7 +1,5 @@
1
- from functools import lru_cache
2
1
  from typing import Any, Generator, Type
3
2
 
4
- import pycountry
5
3
  from anystore.functools import weakref_cache as cache
6
4
  from anystore.types import SDict, StrGenerator
7
5
  from banal import ensure_list, is_listish
@@ -13,9 +11,10 @@ from followthemoney.proxy import EntityProxy
13
11
  from followthemoney.schema import Schema
14
12
  from followthemoney.types import registry
15
13
  from followthemoney.util import make_entity_id, sanitize_text
16
- from normality import collapse_spaces, latinize_text, slugify
14
+ from normality import latinize_text, slugify, squash_spaces
17
15
  from rigour.names import Name, Symbol, tag_org_name, tag_person_name
18
16
  from rigour.names.tokenize import normalize_name
17
+ from rigour.territories import lookup_territory
19
18
  from rigour.text.scripts import can_latinize
20
19
 
21
20
  from ftmq.enums import Comparators
@@ -149,7 +148,7 @@ def apply_dataset(entity: E, dataset: str | Dataset, replace: bool | None = Fals
149
148
  def get_country_name(code: str) -> str:
150
149
  """
151
150
  Get the (english) country name for the given 2-letter iso code via
152
- [pycountry](https://pypi.org/project/pycountry/)
151
+ [rigour.territories](https://rigour.followthemoney.tech/territories/)
153
152
 
154
153
  Examples:
155
154
  >>> get_country_name("de")
@@ -165,22 +164,17 @@ def get_country_name(code: str) -> str:
165
164
  Returns:
166
165
  Either the country name for a valid code or the code as fallback.
167
166
  """
168
- code_clean = get_country_code(code)
169
- if code_clean is None:
170
- code_clean = code.lower()
171
- try:
172
- country = pycountry.countries.get(alpha_2=code_clean)
173
- if country is not None:
174
- return country.name
175
- except (LookupError, AttributeError):
176
- return code
177
- return code_clean
167
+ territory = lookup_territory(code)
168
+ if territory is not None:
169
+ return territory.name
170
+ return code
178
171
 
179
172
 
180
- @lru_cache(1024)
173
+ @cache
181
174
  def get_country_code(value: Any, splitter: str | None = ",") -> str | None:
182
175
  """
183
- Get the 2-letter iso country code for an arbitrary country name
176
+ Get the 2-letter iso country code for an arbitrary country name via
177
+ [rigour.territories](https://rigour.followthemoney.tech/territories/)
184
178
 
185
179
  Examples:
186
180
  >>> get_country_code("Germany")
@@ -201,15 +195,16 @@ def get_country_code(value: Any, splitter: str | None = ",") -> str | None:
201
195
  """
202
196
  value = clean_string(value)
203
197
  if not value:
204
- return
205
- code = registry.country.clean_text(value)
206
- if code:
207
- return code
208
- for token in value.split(splitter):
209
- code = registry.country.clean_text(token)
210
- if code:
211
- return code
212
- return
198
+ return None
199
+ territory = lookup_territory(value)
200
+ if territory is not None:
201
+ return territory.ftm_country
202
+ if splitter:
203
+ for token in value.split(splitter):
204
+ territory = lookup_territory(token.strip())
205
+ if territory is not None:
206
+ return territory.ftm_country
207
+ return None
213
208
 
214
209
 
215
210
  def join_slug(
@@ -321,7 +316,7 @@ def clean_string(value: Any) -> str | None:
321
316
  value = sanitize_text(value)
322
317
  if value is None:
323
318
  return
324
- return collapse_spaces(value)
319
+ return squash_spaces(value)
325
320
 
326
321
 
327
322
  def clean_name(value: Any) -> str | None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ftmq
3
- Version: 4.3.1
3
+ Version: 4.3.4
4
4
  Summary: followthemoney query dsl and io helpers
5
5
  License: AGPLv3+
6
6
  License-File: LICENSE
@@ -21,25 +21,24 @@ Provides-Extra: postgres
21
21
  Provides-Extra: redis
22
22
  Provides-Extra: sql
23
23
  Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
24
- Requires-Dist: anystore (>=0.4.0,<0.5.0)
24
+ Requires-Dist: anystore (>=0.4.2,<0.5.0)
25
25
  Requires-Dist: click (>=8.2.1,<9.0.0)
26
26
  Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
27
27
  Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
28
28
  Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
29
29
  Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
30
- Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
30
+ Requires-Dist: followthemoney (>=4.3.4,<5.0.0)
31
31
  Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
32
- Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
32
+ Requires-Dist: nomenklatura (>=4.3.2,<5.0.0)
33
33
  Requires-Dist: orjson (>=3.10.18,<4.0.0)
34
34
  Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
35
35
  Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
36
36
  Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
37
37
  Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
38
- Requires-Dist: pycountry (>=24.6.1,<25.0.0)
39
38
  Requires-Dist: pydantic (>=2.11.3,<3.0.0)
40
39
  Requires-Dist: pyicu (>=2.15.2,<3.0.0)
41
40
  Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
42
- Requires-Dist: rigour (>=1.4.0,<2.0.0)
41
+ Requires-Dist: rigour (>=1.5.0,<2.0.0)
43
42
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
44
43
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
45
44
  Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
@@ -1,4 +1,4 @@
1
- ftmq/__init__.py,sha256=NbZU0yvkkgqiSaOjVHmRAE8FeeI0fIiwknWOfU9j1Ow,245
1
+ ftmq/__init__.py,sha256=7UYwz5cBV9gTOnLRo15i9E2HMsAZFU5DwaqNeo5HAK4,245
2
2
  ftmq/aggregate.py,sha256=nyAI5w6jKG1T4Jf2yy1ruhPh0vY6p7JWOEYh0SLBdZY,2163
3
3
  ftmq/aggregations.py,sha256=YLu1WF3WgcI3tadWKqsoZk3f_3bYUJetIREy2N1u_EM,4794
4
4
  ftmq/cli.py,sha256=e4wLGfGFmn-8Z6_EhEcOJvUwQM3R-J1i40qh2UVHcGo,12078
@@ -7,7 +7,7 @@ ftmq/filters.py,sha256=TKs454wbSvA5tPj8WbpIrMojctE2jGTLnrrVQKM9PE4,7908
7
7
  ftmq/io.py,sha256=gUbqoZuYXWwxoiJ505H7FhmXLYr5MEUiCEvqIiOkbgo,3789
8
8
  ftmq/logging.py,sha256=p6UvN8qkki8bOjI_NfIjKoLRhdtAdGm4swMWPssETKY,3211
9
9
  ftmq/model/__init__.py,sha256=hgdLNeoYm4o6XoY24_7gtIYf8SUq3d8s_2mda1EQDzg,197
10
- ftmq/model/dataset.py,sha256=Z8RQNz4wvYsbeX77EabI9wH-7difC32_Y_b1QrJkkXA,2198
10
+ ftmq/model/dataset.py,sha256=d6KQDhvxhfQv7J4oF76_ePANDJQ8onxpzhc4-wnui_Q,2228
11
11
  ftmq/model/entity.py,sha256=MIJWORs6X4Z22DDQkHRQlcR3Hug8M_YjZwVEo5KU538,2073
12
12
  ftmq/model/mixins.py,sha256=O_z3Pxv0HzXxu5K-lNFglrIAhh4jejmbcXUjQfVe0_o,435
13
13
  ftmq/model/stats.py,sha256=BiOK7x-JymI5f3dRpwgr3J7OknDP1pe9955Y7AvOJGo,4048
@@ -23,16 +23,16 @@ ftmq/store/fragments/loader.py,sha256=iVh8F22IApe9MRY_Z2fOLvT80fCYstFyxu410l4pPQ
23
23
  ftmq/store/fragments/settings.py,sha256=4c-BW-blVM9gC_IGPch03eExbZYFZ3V5h9yTfhcHvOI,303
24
24
  ftmq/store/fragments/store.py,sha256=LiSfg95LjEmyq2IUpX4CMtp2tE37SEBLtAw0EWufImM,2534
25
25
  ftmq/store/fragments/utils.py,sha256=SDoLPFF5O_oJLIPrCEab5iGn-pl6y0AhYZDYIPYxYkk,1098
26
- ftmq/store/lake.py,sha256=snbEZXTuR9Oy3o7p6XA61IVpsd4-70a6PgoL63PBrRg,11683
26
+ ftmq/store/lake.py,sha256=PW6uwPeyMo-7bxLUK0zINzOgmo6N1Pulx_Xn-7Y2010,12252
27
27
  ftmq/store/level.py,sha256=ZGx-mMtJZJNWkpvbe7ajTREnW5MPcnw0ct3nSFLVF0I,781
28
28
  ftmq/store/memory.py,sha256=lZ_pDzrBWNljbNb1MXJeCoO7TnAdqEfG4kfLDOU5rME,551
29
29
  ftmq/store/redis.py,sha256=d0hkGF_BezdIfCMUshXWoQwvGmqT8JFblUMcCxzwkDA,433
30
30
  ftmq/store/sql.py,sha256=6h3-gDaTAlD-IkiOONcX-JbaAO9-QfSsMjjMPupclcQ,5216
31
31
  ftmq/types.py,sha256=HgF8eT3ynKnDUxBYFtoDytS-uN_CS7Yr3DHIX2r4tnk,774
32
- ftmq/util.py,sha256=CmbZXYAbsKbAjoWn8WxR1Sz4VPXc2gj9CkHwaTqpBG0,15691
33
- ftmq-4.3.1.dist-info/METADATA,sha256=4sW694l-QjyhMtxl7CN1M2vyFwPDckDg4kpso9aD0cs,5324
34
- ftmq-4.3.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
35
- ftmq-4.3.1.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
36
- ftmq-4.3.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
37
- ftmq-4.3.1.dist-info/licenses/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
38
- ftmq-4.3.1.dist-info/RECORD,,
32
+ ftmq/util.py,sha256=8N3NVMnfqzOzCpd7tdpeBlWy37sBoLplzud8isX4r8M,15677
33
+ ftmq-4.3.4.dist-info/METADATA,sha256=l7INHEM2JnzmexcV9XOYVqAuYapVx1F1rdb173aRmqE,5279
34
+ ftmq-4.3.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
35
+ ftmq-4.3.4.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
36
+ ftmq-4.3.4.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
37
+ ftmq-4.3.4.dist-info/licenses/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
38
+ ftmq-4.3.4.dist-info/RECORD,,
File without changes