ftmq 4.3.0__py3-none-any.whl → 4.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ftmq/__init__.py +1 -1
- ftmq/cli.py +24 -1
- ftmq/model/dataset.py +1 -0
- ftmq/store/fragments/dataset.py +74 -12
- ftmq/util.py +2 -2
- {ftmq-4.3.0.dist-info → ftmq-4.3.2.dist-info}/METADATA +8 -6
- {ftmq-4.3.0.dist-info → ftmq-4.3.2.dist-info}/RECORD +11 -11
- {ftmq-4.3.0.dist-info → ftmq-4.3.2.dist-info}/WHEEL +1 -1
- {ftmq-4.3.0.dist-info → ftmq-4.3.2.dist-info}/entry_points.txt +0 -0
- {ftmq-4.3.0.dist-info → ftmq-4.3.2.dist-info/licenses}/LICENSE +0 -0
- {ftmq-4.3.0.dist-info → ftmq-4.3.2.dist-info/licenses}/NOTICE +0 -0
ftmq/__init__.py
CHANGED
ftmq/cli.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
1
3
|
import click
|
|
2
4
|
from anystore.io import smart_write, smart_write_json, smart_write_model
|
|
3
5
|
from click_default_group import DefaultGroup
|
|
@@ -357,16 +359,37 @@ def fragments_list_datasets(
|
|
|
357
359
|
"-o", "--output-uri", default="-", show_default=True, help="output file or uri"
|
|
358
360
|
)
|
|
359
361
|
@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
|
|
362
|
+
@click.option("-s", "--schema", default=None, help="Filter by schema")
|
|
363
|
+
@click.option(
|
|
364
|
+
"--since",
|
|
365
|
+
default=None,
|
|
366
|
+
help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
|
|
367
|
+
)
|
|
368
|
+
@click.option(
|
|
369
|
+
"--until",
|
|
370
|
+
default=None,
|
|
371
|
+
help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
|
|
372
|
+
)
|
|
360
373
|
def fragments_iterate(
|
|
361
374
|
input_uri: str = fragments_settings.database_uri,
|
|
362
375
|
output_uri: str = "-",
|
|
363
376
|
dataset: str = None,
|
|
377
|
+
schema: str | None = None,
|
|
378
|
+
since: str | None = None,
|
|
379
|
+
until: str | None = None,
|
|
364
380
|
):
|
|
365
381
|
"""
|
|
366
382
|
Iterate all entities from a fragments dataset
|
|
367
383
|
"""
|
|
368
384
|
fragments = get_fragments(dataset, database_uri=input_uri)
|
|
369
|
-
|
|
385
|
+
|
|
386
|
+
# Parse timestamp strings to datetime objects
|
|
387
|
+
since_dt = datetime.fromisoformat(since) if since else None
|
|
388
|
+
until_dt = datetime.fromisoformat(until) if until else None
|
|
389
|
+
|
|
390
|
+
smart_write_proxies(
|
|
391
|
+
output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
|
|
392
|
+
)
|
|
370
393
|
|
|
371
394
|
|
|
372
395
|
@cli.command("aggregate")
|
ftmq/model/dataset.py
CHANGED
ftmq/store/fragments/dataset.py
CHANGED
|
@@ -132,7 +132,9 @@ class Fragments(object):
|
|
|
132
132
|
def bulk(self, size=1000):
|
|
133
133
|
return BulkLoader(self, size)
|
|
134
134
|
|
|
135
|
-
def fragments(
|
|
135
|
+
def fragments(
|
|
136
|
+
self, entity_ids=None, fragment=None, schema=None, since=None, until=None
|
|
137
|
+
):
|
|
136
138
|
stmt = self.table.select()
|
|
137
139
|
entity_ids = ensure_list(entity_ids)
|
|
138
140
|
if len(entity_ids) == 1:
|
|
@@ -141,6 +143,18 @@ class Fragments(object):
|
|
|
141
143
|
stmt = stmt.where(self.table.c.id.in_(entity_ids))
|
|
142
144
|
if fragment is not None:
|
|
143
145
|
stmt = stmt.where(self.table.c.fragment == fragment)
|
|
146
|
+
if schema is not None:
|
|
147
|
+
if self.store.is_postgres:
|
|
148
|
+
stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
|
|
149
|
+
else:
|
|
150
|
+
# SQLite JSON support - use json_extract function
|
|
151
|
+
stmt = stmt.where(
|
|
152
|
+
func.json_extract(self.table.c.entity, "$.schema") == schema
|
|
153
|
+
)
|
|
154
|
+
if since is not None:
|
|
155
|
+
stmt = stmt.where(self.table.c.timestamp >= since)
|
|
156
|
+
if until is not None:
|
|
157
|
+
stmt = stmt.where(self.table.c.timestamp <= until)
|
|
144
158
|
stmt = stmt.order_by(self.table.c.id)
|
|
145
159
|
# stmt = stmt.order_by(self.table.c.origin)
|
|
146
160
|
# stmt = stmt.order_by(self.table.c.fragment)
|
|
@@ -159,8 +173,12 @@ class Fragments(object):
|
|
|
159
173
|
finally:
|
|
160
174
|
conn.close()
|
|
161
175
|
|
|
162
|
-
def partials(
|
|
163
|
-
|
|
176
|
+
def partials(
|
|
177
|
+
self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
|
|
178
|
+
) -> EntityFragments:
|
|
179
|
+
for fragment in self.fragments(
|
|
180
|
+
entity_ids=entity_id, schema=schema, since=since, until=until
|
|
181
|
+
):
|
|
164
182
|
try:
|
|
165
183
|
yield EntityProxy.from_dict(fragment, cleaned=True)
|
|
166
184
|
except Exception:
|
|
@@ -169,22 +187,32 @@ class Fragments(object):
|
|
|
169
187
|
continue
|
|
170
188
|
raise
|
|
171
189
|
|
|
172
|
-
def iterate(
|
|
190
|
+
def iterate(
|
|
191
|
+
self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
|
|
192
|
+
) -> EntityFragments:
|
|
173
193
|
if entity_id is None:
|
|
174
194
|
log.info("Using batched iteration for complete dataset.")
|
|
175
|
-
yield from self.iterate_batched(
|
|
195
|
+
yield from self.iterate_batched(
|
|
196
|
+
skip_errors=skip_errors, schema=schema, since=since, until=until
|
|
197
|
+
)
|
|
176
198
|
return
|
|
177
199
|
entity = None
|
|
178
200
|
invalid = None
|
|
179
201
|
fragments = 1
|
|
180
|
-
for partial in self.partials(
|
|
202
|
+
for partial in self.partials(
|
|
203
|
+
entity_id=entity_id,
|
|
204
|
+
skip_errors=skip_errors,
|
|
205
|
+
schema=schema,
|
|
206
|
+
since=since,
|
|
207
|
+
until=until,
|
|
208
|
+
):
|
|
181
209
|
if partial.id == invalid:
|
|
182
210
|
continue
|
|
183
211
|
if entity is not None:
|
|
184
212
|
if entity.id == partial.id:
|
|
185
213
|
fragments += 1
|
|
186
214
|
if fragments % 10000 == 0:
|
|
187
|
-
log.
|
|
215
|
+
log.warning(
|
|
188
216
|
"[%s:%s] aggregated %d fragments...",
|
|
189
217
|
entity.schema.name,
|
|
190
218
|
entity.id,
|
|
@@ -209,16 +237,26 @@ class Fragments(object):
|
|
|
209
237
|
if entity is not None:
|
|
210
238
|
yield entity
|
|
211
239
|
|
|
212
|
-
def iterate_batched(
|
|
240
|
+
def iterate_batched(
|
|
241
|
+
self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
|
|
242
|
+
) -> EntityFragments:
|
|
213
243
|
"""
|
|
214
244
|
For large datasets an overall sort is not feasible, so we iterate in
|
|
215
245
|
sorted batched IDs.
|
|
216
246
|
"""
|
|
217
|
-
for entity_ids in self.get_sorted_id_batches(
|
|
218
|
-
|
|
247
|
+
for entity_ids in self.get_sorted_id_batches(
|
|
248
|
+
batch_size, schema=schema, since=since, until=until
|
|
249
|
+
):
|
|
250
|
+
yield from self.iterate(
|
|
251
|
+
entity_id=entity_ids,
|
|
252
|
+
skip_errors=skip_errors,
|
|
253
|
+
schema=schema,
|
|
254
|
+
since=since,
|
|
255
|
+
until=until,
|
|
256
|
+
)
|
|
219
257
|
|
|
220
258
|
def get_sorted_id_batches(
|
|
221
|
-
self, batch_size=10_000
|
|
259
|
+
self, batch_size=10_000, schema=None, since=None, until=None
|
|
222
260
|
) -> Generator[list[str], None, None]:
|
|
223
261
|
"""
|
|
224
262
|
Get sorted ID batches to speed up iteration and useful to parallelize
|
|
@@ -230,6 +268,20 @@ class Fragments(object):
|
|
|
230
268
|
stmt = select(self.table.c.id).distinct()
|
|
231
269
|
if last_id is not None:
|
|
232
270
|
stmt = stmt.where(self.table.c.id > last_id)
|
|
271
|
+
if schema is not None:
|
|
272
|
+
if self.store.is_postgres:
|
|
273
|
+
stmt = stmt.where(
|
|
274
|
+
self.table.c.entity["schema"].astext == schema
|
|
275
|
+
)
|
|
276
|
+
else:
|
|
277
|
+
# SQLite JSON support - use json_extract function
|
|
278
|
+
stmt = stmt.where(
|
|
279
|
+
func.json_extract(self.table.c.entity, "$.schema") == schema
|
|
280
|
+
)
|
|
281
|
+
if since is not None:
|
|
282
|
+
stmt = stmt.where(self.table.c.timestamp >= since)
|
|
283
|
+
if until is not None:
|
|
284
|
+
stmt = stmt.where(self.table.c.timestamp <= until)
|
|
233
285
|
stmt = stmt.order_by(self.table.c.id).limit(batch_size)
|
|
234
286
|
try:
|
|
235
287
|
res = conn.execute(stmt)
|
|
@@ -242,11 +294,19 @@ class Fragments(object):
|
|
|
242
294
|
self.reset()
|
|
243
295
|
raise
|
|
244
296
|
|
|
297
|
+
def get_sorted_ids(
|
|
298
|
+
self, batch_size=10_000, schema=None, since=None, until=None
|
|
299
|
+
) -> Generator[str, None, None]:
|
|
300
|
+
"""Get sorted IDs, optionally filtered by schema"""
|
|
301
|
+
for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
|
|
302
|
+
yield from batch
|
|
303
|
+
|
|
245
304
|
def statements(
|
|
246
305
|
self,
|
|
247
306
|
entity_ids: Iterable[str] | None = None,
|
|
248
307
|
origin: str | None = None,
|
|
249
308
|
since: datetime | None = None,
|
|
309
|
+
until: datetime | None = None,
|
|
250
310
|
) -> Statements:
|
|
251
311
|
"""Iterate unsorted statements with its fragment origins"""
|
|
252
312
|
stmt = self.table.select()
|
|
@@ -258,7 +318,9 @@ class Fragments(object):
|
|
|
258
318
|
if origin is not None:
|
|
259
319
|
stmt = stmt.where(self.table.c.origin == origin)
|
|
260
320
|
if since is not None:
|
|
261
|
-
stmt = stmt.where(self.table.c.timestamp
|
|
321
|
+
stmt = stmt.where(self.table.c.timestamp >= since)
|
|
322
|
+
if until is not None:
|
|
323
|
+
stmt = stmt.where(self.table.c.timestamp <= until)
|
|
262
324
|
conn = self.store.engine.connect()
|
|
263
325
|
default_dataset = make_dataset(self.name)
|
|
264
326
|
try:
|
ftmq/util.py
CHANGED
|
@@ -13,7 +13,7 @@ from followthemoney.proxy import EntityProxy
|
|
|
13
13
|
from followthemoney.schema import Schema
|
|
14
14
|
from followthemoney.types import registry
|
|
15
15
|
from followthemoney.util import make_entity_id, sanitize_text
|
|
16
|
-
from normality import
|
|
16
|
+
from normality import latinize_text, slugify, squash_spaces
|
|
17
17
|
from rigour.names import Name, Symbol, tag_org_name, tag_person_name
|
|
18
18
|
from rigour.names.tokenize import normalize_name
|
|
19
19
|
from rigour.text.scripts import can_latinize
|
|
@@ -321,7 +321,7 @@ def clean_string(value: Any) -> str | None:
|
|
|
321
321
|
value = sanitize_text(value)
|
|
322
322
|
if value is None:
|
|
323
323
|
return
|
|
324
|
-
return
|
|
324
|
+
return squash_spaces(value)
|
|
325
325
|
|
|
326
326
|
|
|
327
327
|
def clean_name(value: Any) -> str | None:
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: ftmq
|
|
3
|
-
Version: 4.3.
|
|
3
|
+
Version: 4.3.2
|
|
4
4
|
Summary: followthemoney query dsl and io helpers
|
|
5
5
|
License: AGPLv3+
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
License-File: NOTICE
|
|
6
8
|
Author: Simon Wörpel
|
|
7
9
|
Author-email: simon.woerpel@pm.me
|
|
8
10
|
Requires-Python: >=3.11,<4
|
|
@@ -22,12 +24,12 @@ Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
|
|
|
22
24
|
Requires-Dist: anystore (>=0.4.0,<0.5.0)
|
|
23
25
|
Requires-Dist: click (>=8.2.1,<9.0.0)
|
|
24
26
|
Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
|
|
25
|
-
Requires-Dist: deltalake (>=1.2.
|
|
27
|
+
Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
|
|
26
28
|
Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
|
|
27
29
|
Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
|
|
28
|
-
Requires-Dist: followthemoney (>=4.3.
|
|
30
|
+
Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
|
|
29
31
|
Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
|
|
30
|
-
Requires-Dist: nomenklatura (>=4.1.
|
|
32
|
+
Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
|
|
31
33
|
Requires-Dist: orjson (>=3.10.18,<4.0.0)
|
|
32
34
|
Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
|
|
33
35
|
Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
|
|
@@ -37,7 +39,7 @@ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
|
|
|
37
39
|
Requires-Dist: pydantic (>=2.11.3,<3.0.0)
|
|
38
40
|
Requires-Dist: pyicu (>=2.15.2,<3.0.0)
|
|
39
41
|
Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
|
|
40
|
-
Requires-Dist: rigour (>=1.
|
|
42
|
+
Requires-Dist: rigour (>=1.4.1,<2.0.0)
|
|
41
43
|
Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
|
|
42
44
|
Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
|
|
43
45
|
Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
ftmq/__init__.py,sha256=
|
|
1
|
+
ftmq/__init__.py,sha256=ueHfqk0txp2sdGHbXsKPFwErBru33CPWbewYF8swWfA,245
|
|
2
2
|
ftmq/aggregate.py,sha256=nyAI5w6jKG1T4Jf2yy1ruhPh0vY6p7JWOEYh0SLBdZY,2163
|
|
3
3
|
ftmq/aggregations.py,sha256=YLu1WF3WgcI3tadWKqsoZk3f_3bYUJetIREy2N1u_EM,4794
|
|
4
|
-
ftmq/cli.py,sha256=
|
|
4
|
+
ftmq/cli.py,sha256=e4wLGfGFmn-8Z6_EhEcOJvUwQM3R-J1i40qh2UVHcGo,12078
|
|
5
5
|
ftmq/enums.py,sha256=4AJ6Ii8Bnbfz0BSznyK0IUopy-XMquuAuvud8ZcrD54,2521
|
|
6
6
|
ftmq/filters.py,sha256=TKs454wbSvA5tPj8WbpIrMojctE2jGTLnrrVQKM9PE4,7908
|
|
7
7
|
ftmq/io.py,sha256=gUbqoZuYXWwxoiJ505H7FhmXLYr5MEUiCEvqIiOkbgo,3789
|
|
8
8
|
ftmq/logging.py,sha256=p6UvN8qkki8bOjI_NfIjKoLRhdtAdGm4swMWPssETKY,3211
|
|
9
9
|
ftmq/model/__init__.py,sha256=hgdLNeoYm4o6XoY24_7gtIYf8SUq3d8s_2mda1EQDzg,197
|
|
10
|
-
ftmq/model/dataset.py,sha256=
|
|
10
|
+
ftmq/model/dataset.py,sha256=d6KQDhvxhfQv7J4oF76_ePANDJQ8onxpzhc4-wnui_Q,2228
|
|
11
11
|
ftmq/model/entity.py,sha256=MIJWORs6X4Z22DDQkHRQlcR3Hug8M_YjZwVEo5KU538,2073
|
|
12
12
|
ftmq/model/mixins.py,sha256=O_z3Pxv0HzXxu5K-lNFglrIAhh4jejmbcXUjQfVe0_o,435
|
|
13
13
|
ftmq/model/stats.py,sha256=BiOK7x-JymI5f3dRpwgr3J7OknDP1pe9955Y7AvOJGo,4048
|
|
@@ -18,7 +18,7 @@ ftmq/store/__init__.py,sha256=HH30KAHqo1kPr4qbjo6oKxdvgpF_XrqPgk8hSCkPbAY,3152
|
|
|
18
18
|
ftmq/store/aleph.py,sha256=vENWpMgV1rViMwcef9wAwOGyrciCnSPRN-dSI103478,3977
|
|
19
19
|
ftmq/store/base.py,sha256=1IgX4haeNUA9NTBCi-hBFVe0u44ra1nGdHAjSTOUEAs,4762
|
|
20
20
|
ftmq/store/fragments/__init__.py,sha256=jHXHejqXe6sBNllAt-BuU24Ou8m5evmsD1UPac5J2GE,750
|
|
21
|
-
ftmq/store/fragments/dataset.py,sha256=
|
|
21
|
+
ftmq/store/fragments/dataset.py,sha256=P1ljcpoRpl_IIApA2A3vizpB2goRxxue_NNoeI1JQN8,13032
|
|
22
22
|
ftmq/store/fragments/loader.py,sha256=iVh8F22IApe9MRY_Z2fOLvT80fCYstFyxu410l4pPQY,4066
|
|
23
23
|
ftmq/store/fragments/settings.py,sha256=4c-BW-blVM9gC_IGPch03eExbZYFZ3V5h9yTfhcHvOI,303
|
|
24
24
|
ftmq/store/fragments/store.py,sha256=LiSfg95LjEmyq2IUpX4CMtp2tE37SEBLtAw0EWufImM,2534
|
|
@@ -29,10 +29,10 @@ ftmq/store/memory.py,sha256=lZ_pDzrBWNljbNb1MXJeCoO7TnAdqEfG4kfLDOU5rME,551
|
|
|
29
29
|
ftmq/store/redis.py,sha256=d0hkGF_BezdIfCMUshXWoQwvGmqT8JFblUMcCxzwkDA,433
|
|
30
30
|
ftmq/store/sql.py,sha256=6h3-gDaTAlD-IkiOONcX-JbaAO9-QfSsMjjMPupclcQ,5216
|
|
31
31
|
ftmq/types.py,sha256=HgF8eT3ynKnDUxBYFtoDytS-uN_CS7Yr3DHIX2r4tnk,774
|
|
32
|
-
ftmq/util.py,sha256=
|
|
33
|
-
ftmq-4.3.
|
|
34
|
-
ftmq-4.3.
|
|
35
|
-
ftmq-4.3.
|
|
36
|
-
ftmq-4.3.
|
|
37
|
-
ftmq-4.3.
|
|
38
|
-
ftmq-4.3.
|
|
32
|
+
ftmq/util.py,sha256=s6TypjCvOy5FVgxn-FeCsIDtwkmUsyj6oIZ60Q56PBo,15687
|
|
33
|
+
ftmq-4.3.2.dist-info/METADATA,sha256=Un8mFQdd6tOfyD2wAKvQlcmU9dZ8tUFGynX0MN2Xv-E,5324
|
|
34
|
+
ftmq-4.3.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
35
|
+
ftmq-4.3.2.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
|
|
36
|
+
ftmq-4.3.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
|
37
|
+
ftmq-4.3.2.dist-info/licenses/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
|
|
38
|
+
ftmq-4.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|