ftmq-4.3.0-py3-none-any.whl → ftmq-4.3.2-py3-none-any.whl

ftmq/__init__.py CHANGED
@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
 from ftmq.query import Query
 from ftmq.util import make_entity
 
-__version__ = "4.3.0"
+__version__ = "4.3.2"
 __all__ = [
     "smart_read_proxies",
     "smart_write_proxies",
ftmq/cli.py CHANGED
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import click
 from anystore.io import smart_write, smart_write_json, smart_write_model
 from click_default_group import DefaultGroup
@@ -357,16 +359,37 @@ def fragments_list_datasets(
     "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
 )
 @click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+@click.option("-s", "--schema", default=None, help="Filter by schema")
+@click.option(
+    "--since",
+    default=None,
+    help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+@click.option(
+    "--until",
+    default=None,
+    help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
 def fragments_iterate(
     input_uri: str = fragments_settings.database_uri,
     output_uri: str = "-",
     dataset: str = None,
+    schema: str | None = None,
+    since: str | None = None,
+    until: str | None = None,
 ):
     """
     Iterate all entities from a fragments dataset
     """
     fragments = get_fragments(dataset, database_uri=input_uri)
-    smart_write_proxies(output_uri, fragments.iterate())
+
+    # Parse timestamp strings to datetime objects
+    since_dt = datetime.fromisoformat(since) if since else None
+    until_dt = datetime.fromisoformat(until) if until else None
+
+    smart_write_proxies(
+        output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
+    )
 
 
 @cli.command("aggregate")
ftmq/model/dataset.py CHANGED
@@ -16,6 +16,7 @@ ContentType = Literal["documents", "structured", "mixed"]
 
 
 class Dataset(BaseModel, _DatasetModel):
+    prefix: str | None = None
     maintainer: DataPublisher | None = None
     stats: DatasetStats = DatasetStats()
     git_repo: AnyUrl | None = None
ftmq/store/fragments/dataset.py CHANGED
@@ -132,7 +132,9 @@ class Fragments(object):
     def bulk(self, size=1000):
         return BulkLoader(self, size)
 
-    def fragments(self, entity_ids=None, fragment=None):
+    def fragments(
+        self, entity_ids=None, fragment=None, schema=None, since=None, until=None
+    ):
         stmt = self.table.select()
         entity_ids = ensure_list(entity_ids)
         if len(entity_ids) == 1:
@@ -141,6 +143,18 @@ class Fragments(object):
             stmt = stmt.where(self.table.c.id.in_(entity_ids))
         if fragment is not None:
             stmt = stmt.where(self.table.c.fragment == fragment)
+        if schema is not None:
+            if self.store.is_postgres:
+                stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
+            else:
+                # SQLite JSON support - use json_extract function
+                stmt = stmt.where(
+                    func.json_extract(self.table.c.entity, "$.schema") == schema
+                )
+        if since is not None:
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         stmt = stmt.order_by(self.table.c.id)
         # stmt = stmt.order_by(self.table.c.origin)
         # stmt = stmt.order_by(self.table.c.fragment)
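
The schema filter above branches per backend: Postgres can index into a JSONB column directly, while SQLite goes through the `json_extract()` SQL function. A self-contained sketch of that split, using an illustrative table that only mirrors the columns involved (none of these names come from ftmq):

    from sqlalchemy import Column, DateTime, MetaData, String, Table, func, select
    from sqlalchemy.dialects import postgresql
    from sqlalchemy.dialects.postgresql import JSONB

    metadata = MetaData()
    # Illustrative stand-in for the fragments table: a JSON document column
    # ("entity") plus a "timestamp" column.
    table = Table(
        "fragments",
        metadata,
        Column("id", String),
        Column("entity", JSONB),
        Column("timestamp", DateTime),
    )

    def filter_schema(stmt, schema: str, is_postgres: bool):
        if is_postgres:
            # JSONB path: entity['schema'] compared as text (renders entity ->> 'schema')
            return stmt.where(table.c.entity["schema"].astext == schema)
        # SQLite path: json_extract(entity, '$.schema')
        return stmt.where(func.json_extract(table.c.entity, "$.schema") == schema)

    stmt = filter_schema(select(table.c.id), "Person", is_postgres=True)
    print(stmt.compile(dialect=postgresql.dialect()))
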
@@ -159,8 +173,12 @@ class Fragments(object):
         finally:
             conn.close()
 
-    def partials(self, entity_id=None, skip_errors=False) -> EntityFragments:
-        for fragment in self.fragments(entity_ids=entity_id):
+    def partials(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        for fragment in self.fragments(
+            entity_ids=entity_id, schema=schema, since=since, until=until
+        ):
             try:
                 yield EntityProxy.from_dict(fragment, cleaned=True)
             except Exception:
@@ -169,22 +187,32 @@ class Fragments(object):
                 continue
             raise
 
-    def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
+    def iterate(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
         if entity_id is None:
             log.info("Using batched iteration for complete dataset.")
-            yield from self.iterate_batched()
+            yield from self.iterate_batched(
+                skip_errors=skip_errors, schema=schema, since=since, until=until
+            )
             return
         entity = None
         invalid = None
         fragments = 1
-        for partial in self.partials(entity_id=entity_id, skip_errors=skip_errors):
+        for partial in self.partials(
+            entity_id=entity_id,
+            skip_errors=skip_errors,
+            schema=schema,
+            since=since,
+            until=until,
+        ):
             if partial.id == invalid:
                 continue
             if entity is not None:
                 if entity.id == partial.id:
                     fragments += 1
                     if fragments % 10000 == 0:
-                        log.debug(
+                        log.warning(
                             "[%s:%s] aggregated %d fragments...",
                             entity.schema.name,
                             entity.id,
@@ -209,16 +237,26 @@ class Fragments(object):
         if entity is not None:
             yield entity
 
-    def iterate_batched(self, skip_errors=False, batch_size=10_000) -> EntityFragments:
+    def iterate_batched(
+        self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
+    ) -> EntityFragments:
         """
         For large datasets an overall sort is not feasible, so we iterate in
         sorted batched IDs.
         """
-        for entity_ids in self.get_sorted_id_batches(batch_size):
-            yield from self.iterate(entity_id=entity_ids, skip_errors=skip_errors)
+        for entity_ids in self.get_sorted_id_batches(
+            batch_size, schema=schema, since=since, until=until
+        ):
+            yield from self.iterate(
+                entity_id=entity_ids,
+                skip_errors=skip_errors,
+                schema=schema,
+                since=since,
+                until=until,
+            )
 
     def get_sorted_id_batches(
-        self, batch_size=10_000
+        self, batch_size=10_000, schema=None, since=None, until=None
     ) -> Generator[list[str], None, None]:
         """
         Get sorted ID batches to speed up iteration and useful to parallelize
@@ -230,6 +268,20 @@ class Fragments(object):
             stmt = select(self.table.c.id).distinct()
            if last_id is not None:
                 stmt = stmt.where(self.table.c.id > last_id)
+            if schema is not None:
+                if self.store.is_postgres:
+                    stmt = stmt.where(
+                        self.table.c.entity["schema"].astext == schema
+                    )
+                else:
+                    # SQLite JSON support - use json_extract function
+                    stmt = stmt.where(
+                        func.json_extract(self.table.c.entity, "$.schema") == schema
+                    )
+            if since is not None:
+                stmt = stmt.where(self.table.c.timestamp >= since)
+            if until is not None:
+                stmt = stmt.where(self.table.c.timestamp <= until)
             stmt = stmt.order_by(self.table.c.id).limit(batch_size)
             try:
                 res = conn.execute(stmt)
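
get_sorted_id_batches() pages by keyset (remembering the last ID seen) instead of OFFSET, which stays fast on large tables. A runnable, self-contained sketch of the same pattern against an in-memory SQLite table (table name, column and ID values are made up for illustration):

    from sqlalchemy import Column, MetaData, String, Table, create_engine, insert, select

    engine = create_engine("sqlite://")
    metadata = MetaData()
    table = Table("fragments", metadata, Column("id", String))
    metadata.create_all(engine)

    with engine.begin() as conn:
        conn.execute(insert(table), [{"id": f"entity-{i:03d}"} for i in range(7)])

    def id_batches(conn, batch_size=3):
        last_id = None
        while True:
            # distinct sorted IDs, resuming after the last ID of the previous batch
            stmt = select(table.c.id).distinct()
            if last_id is not None:
                stmt = stmt.where(table.c.id > last_id)
            stmt = stmt.order_by(table.c.id).limit(batch_size)
            ids = [row.id for row in conn.execute(stmt)]
            if not ids:
                return
            yield ids
            last_id = ids[-1]

    with engine.connect() as conn:
        for batch in id_batches(conn):
            print(batch)  # ['entity-000', 'entity-001', 'entity-002'], ...
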
@@ -242,11 +294,19 @@ class Fragments(object):
                 self.reset()
             raise
 
+    def get_sorted_ids(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[str, None, None]:
+        """Get sorted IDs, optionally filtered by schema"""
+        for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
+            yield from batch
+
     def statements(
         self,
         entity_ids: Iterable[str] | None = None,
         origin: str | None = None,
         since: datetime | None = None,
+        until: datetime | None = None,
     ) -> Statements:
         """Iterate unsorted statements with its fragment origins"""
         stmt = self.table.select()
@@ -258,7 +318,9 @@ class Fragments(object):
         if origin is not None:
             stmt = stmt.where(self.table.c.origin == origin)
         if since is not None:
-            stmt = stmt.where(self.table.c.timestamp > since)
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         conn = self.store.engine.connect()
         default_dataset = make_dataset(self.name)
         try:
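
Taken together, callers can now push schema and time-window filters into the SQL query instead of filtering proxies after the fact. A hedged usage sketch (the schema name and dates are illustrative; `fragments` stands for whatever `get_fragments()` returns in cli.py above):

    from datetime import datetime

    def iterate_people_h1_2024(fragments):
        """fragments: a Fragments instance, e.g. the object get_fragments() returns."""
        # Only Person entities whose fragments carry a timestamp in H1 2024.
        yield from fragments.iterate(
            schema="Person",
            since=datetime(2024, 1, 1),
            until=datetime(2024, 6, 30, 23, 59, 59),
        )
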
ftmq/util.py CHANGED
@@ -13,7 +13,7 @@ from followthemoney.proxy import EntityProxy
 from followthemoney.schema import Schema
 from followthemoney.types import registry
 from followthemoney.util import make_entity_id, sanitize_text
-from normality import collapse_spaces, latinize_text, slugify
+from normality import latinize_text, slugify, squash_spaces
 from rigour.names import Name, Symbol, tag_org_name, tag_person_name
 from rigour.names.tokenize import normalize_name
 from rigour.text.scripts import can_latinize
@@ -321,7 +321,7 @@ def clean_string(value: Any) -> str | None:
     value = sanitize_text(value)
     if value is None:
         return
-    return collapse_spaces(value)
+    return squash_spaces(value)
 
 
 def clean_name(value: Any) -> str | None:
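
The import swap replaces normality's collapse_spaces with squash_spaces; both normalize whitespace, so clean_string should behave as before. A tiny illustration (the exact output is an assumption about squash_spaces, not taken from this diff):

    from normality import squash_spaces

    # Assumed: runs of spaces/tabs/newlines become single spaces, ends trimmed.
    print(squash_spaces("  ftm   entity \t name\n"))  # -> "ftm entity name"
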
ftmq-4.3.0.dist-info/METADATA → ftmq-4.3.2.dist-info/METADATA CHANGED
@@ -1,8 +1,10 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: ftmq
-Version: 4.3.0
+Version: 4.3.2
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
+License-File: LICENSE
+License-File: NOTICE
 Author: Simon Wörpel
 Author-email: simon.woerpel@pm.me
 Requires-Python: >=3.11,<4
@@ -22,12 +24,12 @@ Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
 Requires-Dist: anystore (>=0.4.0,<0.5.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.2.0,<2.0.0) ; extra == "lake"
+Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
 Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.3.0,<5.0.0)
+Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.9,<5.0.0)
+Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
 Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
@@ -37,7 +39,7 @@ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
-Requires-Dist: rigour (>=1.3.13,<2.0.0)
+Requires-Dist: rigour (>=1.4.1,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
ftmq-4.3.0.dist-info/RECORD → ftmq-4.3.2.dist-info/RECORD CHANGED
@@ -1,13 +1,13 @@
-ftmq/__init__.py,sha256=yG0oVaBCNY1c6a2EIhTGutZEjNyiMgcn10v_8khS6sQ,245
+ftmq/__init__.py,sha256=ueHfqk0txp2sdGHbXsKPFwErBru33CPWbewYF8swWfA,245
 ftmq/aggregate.py,sha256=nyAI5w6jKG1T4Jf2yy1ruhPh0vY6p7JWOEYh0SLBdZY,2163
 ftmq/aggregations.py,sha256=YLu1WF3WgcI3tadWKqsoZk3f_3bYUJetIREy2N1u_EM,4794
-ftmq/cli.py,sha256=roKQ1k-PUm15-mxD0fvjOwOhmpCBpuelYoECycfVjHw,11400
+ftmq/cli.py,sha256=e4wLGfGFmn-8Z6_EhEcOJvUwQM3R-J1i40qh2UVHcGo,12078
 ftmq/enums.py,sha256=4AJ6Ii8Bnbfz0BSznyK0IUopy-XMquuAuvud8ZcrD54,2521
 ftmq/filters.py,sha256=TKs454wbSvA5tPj8WbpIrMojctE2jGTLnrrVQKM9PE4,7908
 ftmq/io.py,sha256=gUbqoZuYXWwxoiJ505H7FhmXLYr5MEUiCEvqIiOkbgo,3789
 ftmq/logging.py,sha256=p6UvN8qkki8bOjI_NfIjKoLRhdtAdGm4swMWPssETKY,3211
 ftmq/model/__init__.py,sha256=hgdLNeoYm4o6XoY24_7gtIYf8SUq3d8s_2mda1EQDzg,197
-ftmq/model/dataset.py,sha256=Z8RQNz4wvYsbeX77EabI9wH-7difC32_Y_b1QrJkkXA,2198
+ftmq/model/dataset.py,sha256=d6KQDhvxhfQv7J4oF76_ePANDJQ8onxpzhc4-wnui_Q,2228
 ftmq/model/entity.py,sha256=MIJWORs6X4Z22DDQkHRQlcR3Hug8M_YjZwVEo5KU538,2073
 ftmq/model/mixins.py,sha256=O_z3Pxv0HzXxu5K-lNFglrIAhh4jejmbcXUjQfVe0_o,435
 ftmq/model/stats.py,sha256=BiOK7x-JymI5f3dRpwgr3J7OknDP1pe9955Y7AvOJGo,4048
@@ -18,7 +18,7 @@ ftmq/store/__init__.py,sha256=HH30KAHqo1kPr4qbjo6oKxdvgpF_XrqPgk8hSCkPbAY,3152
 ftmq/store/aleph.py,sha256=vENWpMgV1rViMwcef9wAwOGyrciCnSPRN-dSI103478,3977
 ftmq/store/base.py,sha256=1IgX4haeNUA9NTBCi-hBFVe0u44ra1nGdHAjSTOUEAs,4762
 ftmq/store/fragments/__init__.py,sha256=jHXHejqXe6sBNllAt-BuU24Ou8m5evmsD1UPac5J2GE,750
-ftmq/store/fragments/dataset.py,sha256=8mHJo7CW_9kyYgnXapvvZamaG33DXBy8lx8qmNbuD3s,10623
+ftmq/store/fragments/dataset.py,sha256=P1ljcpoRpl_IIApA2A3vizpB2goRxxue_NNoeI1JQN8,13032
 ftmq/store/fragments/loader.py,sha256=iVh8F22IApe9MRY_Z2fOLvT80fCYstFyxu410l4pPQY,4066
 ftmq/store/fragments/settings.py,sha256=4c-BW-blVM9gC_IGPch03eExbZYFZ3V5h9yTfhcHvOI,303
 ftmq/store/fragments/store.py,sha256=LiSfg95LjEmyq2IUpX4CMtp2tE37SEBLtAw0EWufImM,2534
@@ -29,10 +29,10 @@ ftmq/store/memory.py,sha256=lZ_pDzrBWNljbNb1MXJeCoO7TnAdqEfG4kfLDOU5rME,551
 ftmq/store/redis.py,sha256=d0hkGF_BezdIfCMUshXWoQwvGmqT8JFblUMcCxzwkDA,433
 ftmq/store/sql.py,sha256=6h3-gDaTAlD-IkiOONcX-JbaAO9-QfSsMjjMPupclcQ,5216
 ftmq/types.py,sha256=HgF8eT3ynKnDUxBYFtoDytS-uN_CS7Yr3DHIX2r4tnk,774
-ftmq/util.py,sha256=CmbZXYAbsKbAjoWn8WxR1Sz4VPXc2gj9CkHwaTqpBG0,15691
-ftmq-4.3.0.dist-info/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-ftmq-4.3.0.dist-info/METADATA,sha256=uOlBTQdtpSar7Bc4GRiLh3RQd9tlxWhuX0UOUd9pbn8,5281
-ftmq-4.3.0.dist-info/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
-ftmq-4.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-ftmq-4.3.0.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
-ftmq-4.3.0.dist-info/RECORD,,
+ftmq/util.py,sha256=s6TypjCvOy5FVgxn-FeCsIDtwkmUsyj6oIZ60Q56PBo,15687
+ftmq-4.3.2.dist-info/METADATA,sha256=Un8mFQdd6tOfyD2wAKvQlcmU9dZ8tUFGynX0MN2Xv-E,5324
+ftmq-4.3.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ftmq-4.3.2.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
+ftmq-4.3.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ftmq-4.3.2.dist-info/licenses/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
+ftmq-4.3.2.dist-info/RECORD,,
ftmq-4.3.0.dist-info/WHEEL → ftmq-4.3.2.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 2.1.3
+Generator: poetry-core 2.2.1
 Root-Is-Purelib: true
 Tag: py3-none-any