ingestify 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +228 -11
  3. ingestify/application/ingestion_engine.py +229 -7
  4. ingestify/application/loader.py +153 -28
  5. ingestify/cmdline.py +0 -48
  6. ingestify/domain/models/__init__.py +2 -0
  7. ingestify/domain/models/dataset/collection.py +0 -9
  8. ingestify/domain/models/dataset/dataset_repository.py +4 -0
  9. ingestify/domain/models/dataset/dataset_state.py +5 -0
  10. ingestify/domain/models/dataset/events.py +13 -0
  11. ingestify/domain/models/dataset/file.py +1 -1
  12. ingestify/domain/models/dataset/selector.py +8 -1
  13. ingestify/domain/models/event/event_bus.py +16 -1
  14. ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  15. ingestify/domain/models/resources/dataset_resource.py +0 -1
  16. ingestify/infra/source/statsbomb/base.py +36 -0
  17. ingestify/infra/source/statsbomb/match.py +137 -0
  18. ingestify/infra/source/statsbomb_github.py +46 -44
  19. ingestify/infra/store/dataset/sqlalchemy/repository.py +100 -31
  20. ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  21. ingestify/main.py +35 -10
  22. ingestify/utils.py +2 -32
  23. ingestify-0.7.0.dist-info/METADATA +211 -0
  24. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/RECORD +28 -36
  25. ingestify/infra/source/wyscout.py +0 -175
  26. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  27. ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  28. ingestify/static/templates/statsbomb_github/query.py +0 -14
  29. ingestify/static/templates/wyscout/.env +0 -5
  30. ingestify/static/templates/wyscout/.gitignore +0 -2
  31. ingestify/static/templates/wyscout/README.md +0 -0
  32. ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  33. ingestify/static/templates/wyscout/database/README.md +0 -1
  34. ingestify/static/templates/wyscout/query.py +0 -14
  35. ingestify-0.6.3.dist-info/METADATA +0 -266
  36. /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
  37. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/WHEEL +0 -0
  38. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/top_level.txt +0 -0
ingestify/infra/source/statsbomb/match.py ADDED
@@ -0,0 +1,137 @@
+ from datetime import datetime
+
+ from ingestify import DatasetResource
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+ from .base import StatsBombBaseAPI
+
+
+ class StatsBombMatchAPI(StatsBombBaseAPI):
+     def discover_selectors(self, dataset_type: str):
+         assert dataset_type == "match"
+
+         competitions = self.get(data_spec_version="v4", path="competitions")
+
+         def get_last_modified(competition):
+             if not competition["match_updated"]:
+                 return None
+
+             last_modified = datetime.fromisoformat(
+                 competition["match_updated"] + "+00:00"
+             )
+             if competition["match_updated_360"]:
+                 last_modified = max(
+                     last_modified,
+                     datetime.fromisoformat(competition["match_updated_360"] + "+00:00"),
+                 )
+             return last_modified
+
+         return [
+             dict(
+                 competition_id=competition["competition_id"],
+                 season_id=competition["season_id"],
+                 # Passing the LastModified for an entire competition allows Ingestify to entirely skip
+                 # this Selector based on a datetime based check. Dataset comparison won't happen. When the
+                 # DataSpecVersion is changed, but LastModified isn't changed on the Source, new files ARE NOT ingested!
+                 _last_modified=get_last_modified(competition),
+             )
+             for competition in competitions
+         ]
+
+     def find_datasets(
+         self,
+         dataset_type: str,
+         competition_id: str,
+         season_id: str,
+         match_id: str = None,
+         data_spec_versions=None,
+         dataset_collection_metadata=None,
+     ):
+         assert dataset_type == "match"
+
+         match_data_spec_version = data_spec_versions.get_version("match")
+
+         matches = self.get(
+             path=f"competitions/{competition_id}/seasons/{season_id}/matches",
+             data_spec_version=match_data_spec_version,
+         )
+
+         for match in matches:
+             if match_id:
+                 if match["match_id"] != match_id:
+                     continue
+
+             last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")
+
+             if match["collection_status"] == "Complete":
+                 if match["match_status"] == "available":
+                     state = DatasetState.COMPLETE
+                 else:
+                     # This could be "processing"
+                     state = DatasetState.PARTIAL
+             else:
+                 state = DatasetState.SCHEDULED
+
+             name = (
+                 f"{match['match_date']} / "
+                 f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+             )
+
+             dataset_resource = DatasetResource(
+                 dataset_resource_id=dict(
+                     competition_id=competition_id,
+                     season_id=season_id,
+                     match_id=match["match_id"],
+                 ),
+                 dataset_type=dataset_type,
+                 provider=self.provider,
+                 name=name,
+                 metadata=match,
+                 state=state,
+             )
+
+             dataset_resource.add_file(
+                 last_modified=last_modified,
+                 data_feed_key="match",
+                 data_spec_version=match_data_spec_version,
+                 json_content=match,
+             )
+
+             if state.is_complete:
+                 name += f" / {match['home_score']}-{match['away_score']}"
+
+             for data_feed_key in ["lineups", "events"]:
+                 for data_spec_version in data_spec_versions[data_feed_key]:
+                     dataset_resource.add_file(
+                         # Note: the LastModified value can be incorrect when only match Metadata (match file)
+                         # is changed. Use it anyway for indication. Ingestify will also use the
+                         # Dataset.last_modified_at value to determine if a file should be refetched
+                         last_modified=last_modified,
+                         data_feed_key=data_feed_key,
+                         data_spec_version=data_spec_version,
+                         url=self.get_url(
+                             data_feed_key, data_spec_version, match["match_id"]
+                         ),
+                         http_options=dict(auth=(self.username, self.password)),
+                         data_serialization_format="json",
+                     )
+
+             if (
+                 match["last_updated_360"]
+                 and match["match_status_360"] == "available"
+             ):
+                 for data_spec_version in data_spec_versions.get("360-frames", []):
+                     dataset_resource.add_file(
+                         last_modified=datetime.fromisoformat(
+                             match["last_updated_360"] + "+00:00"
+                         ),
+                         data_feed_key="360-frames",
+                         data_spec_version=data_spec_version,
+                         url=self.get_url(
+                             "360-frames", data_spec_version, match["match_id"]
+                         ),
+                         http_options=dict(auth=(self.username, self.password)),
+                         data_serialization_format="json",
+                     )
+
+             yield dataset_resource
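
The new source maps StatsBomb's collection_status / match_status fields onto Ingestify's DatasetState before any files are attached. For reference, a standalone sketch of that mapping — the DatasetState enum below is a simplified stand-in (the real one lives in ingestify.domain.models.dataset.dataset_state), and the sample field values are illustrative:

# Illustrative sketch; simplified stand-in for ingestify's DatasetState.
from enum import Enum


class DatasetState(Enum):
    SCHEDULED = "SCHEDULED"
    PARTIAL = "PARTIAL"
    COMPLETE = "COMPLETE"


def match_state(match: dict) -> DatasetState:
    # Mirrors the branch in StatsBombMatchAPI.find_datasets above.
    if match["collection_status"] == "Complete":
        if match["match_status"] == "available":
            return DatasetState.COMPLETE
        # e.g. still "processing"
        return DatasetState.PARTIAL
    return DatasetState.SCHEDULED


assert match_state({"collection_status": "Complete", "match_status": "available"}) is DatasetState.COMPLETE
assert match_state({"collection_status": "Scheduled", "match_status": "processing"}) is DatasetState.SCHEDULED
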
ingestify/infra/source/statsbomb_github.py CHANGED
@@ -21,6 +21,7 @@ class StatsbombGithub(Source):
              dict(
                  competition_id=competition["competition_id"],
                  season_id=competition["season_id"],
+                 _name=f"{competition['competition_name']} - {competition['season_name']}",
              )
              for competition in competitions
          ]
@@ -53,53 +54,54 @@ class StatsbombGithub(Source):
              name = (
                  f"{match['match_date']} / "
                  f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+                 f" / {match['home_score']}-{match['away_score']}"
              )

-             dataset_resource = DatasetResource(
-                 dataset_resource_id=dict(
-                     competition_id=competition_id,
-                     season_id=season_id,
-                     match_id=match["match_id"],
-                 ),
-                 dataset_type=dataset_type,
-                 provider=self.provider,
-                 name=name,
-                 metadata=match,
-                 state=state,
+             dataset_resource = (
+                 DatasetResource(
+                     dataset_resource_id=dict(
+                         competition_id=competition_id,
+                         season_id=season_id,
+                         match_id=match["match_id"],
+                     ),
+                     dataset_type=dataset_type,
+                     provider=self.provider,
+                     name=name,
+                     metadata=match,
+                     state=state,
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="match",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     json_content=match,
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="lineups",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/lineups/{match['match_id']}.json",
+                     data_serialization_format="json",
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="events",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/events/{match['match_id']}.json",
+                     data_serialization_format="json",
+                 )
              )

-             dataset_resource.add_file(
-                 last_modified=last_modified,
-                 data_feed_key="match",
-                 data_spec_version=DATA_SPEC_VERSION,
-                 json_content=match,
-             )
-
-             if state.is_complete:
-                 name += f" / {match['home_score']}-{match['away_score']}"
-
-             for data_feed_key in ["lineups", "events"]:
-                 dataset_resource.add_file(
-                     last_modified=last_modified,
-                     data_feed_key=data_feed_key,
-                     data_spec_version=DATA_SPEC_VERSION,
-                     url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
-                     data_serialization_format="json",
-                 )
-
-             if (
-                 match["last_updated_360"]
-                 and match["match_status_360"] == "available"
-             ):
-                 dataset_resource.add_file(
-                     last_modified=datetime.fromisoformat(
-                         match["last_updated_360"] + "+00:00"
-                     ),
-                     data_feed_key="360-frames",
-                     data_spec_version=DATA_SPEC_VERSION,
-                     url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
-                     data_serialization_format="json",
-                     http_options={"ignore_not_found": True},
-                 )
+             if match["last_updated_360"] and match["match_status_360"] == "available":
+                 dataset_resource.add_file(
+                     last_modified=datetime.fromisoformat(
+                         match["last_updated_360"] + "+00:00"
+                     ),
+                     data_feed_key="360-frames",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                     data_serialization_format="json",
+                     http_options={"ignore_not_found": True},
+                 )

              yield dataset_resource
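
The rewrite above switches statsbomb_github to a chained builder style, which implies DatasetResource.add_file returns the resource itself (the chain result is assigned to dataset_resource and add_file is called on it again later). A minimal sketch of that fluent shape with a hypothetical stand-in class, not the package's actual implementation:

# Hypothetical stand-in illustrating the chained add_file style used above.
from dataclasses import dataclass, field


@dataclass
class Resource:
    files: list = field(default_factory=list)

    def add_file(self, **kwargs) -> "Resource":
        self.files.append(kwargs)
        return self  # returning self is what makes the chaining possible


r = Resource().add_file(data_feed_key="match").add_file(data_feed_key="lineups").add_file(data_feed_key="events")
assert [f["data_feed_key"] for f in r.files] == ["match", "lineups", "events"]
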
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -29,6 +29,7 @@ from ingestify.domain.models import (
      Dataset,
      DatasetCollection,
      DatasetRepository,
+     DatasetState,
      Selector,
  )
  from ingestify.domain.models.dataset.collection_metadata import (
@@ -46,6 +47,7 @@ from .tables import (
      revision_table,
      ingestion_job_summary_table,
      task_summary_table,
+     store_version_table,
  )

  logger = logging.getLogger(__name__)
@@ -159,6 +161,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          entities: list[dict],
          immutable_rows: bool = False,
      ):
+         if not entities:
+             # Nothing to do
+             return
+
          dialect = self.dialect.name
          if dialect == "mysql":
              from sqlalchemy.dialects.mysql import insert
@@ -230,6 +236,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          provider: Optional[str] = None,
          dataset_id: Optional[Union[str, List[str]]] = None,
          selector: Optional[Union[Selector, List[Selector]]] = None,
+         dataset_state: Optional[List[DatasetState]] = None,
      ):
          if dataset_id is not None:
              if isinstance(dataset_id, list):
@@ -268,33 +275,35 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          if not selectors:
              raise ValueError("Selectors must contain at least one item")

-         attribute_cte = self._build_cte(
-             [selector.filtered_attributes for selector in selectors], "attributes"
-         )
-
-         keys = list(selectors[0].filtered_attributes.keys())
          first_selector = selectors[0].filtered_attributes
+         keys = list(first_selector.keys())

-         join_conditions = []
-         for k in keys:
-             if dialect == "postgresql":
-                 column = dataset_table.c.identifier[k]
+         if keys:
+             attribute_cte = self._build_cte(
+                 [selector.filtered_attributes for selector in selectors],
+                 "attributes",
+             )

-                 # Take the value from the first selector to determine the type.
-                 # TODO: check all selectors to determine the type
-                 v = first_selector[k]
-                 if isinstance(v, int):
-                     column = column.as_integer()
+             join_conditions = []
+             for k in keys:
+                 if dialect == "postgresql":
+                     column = dataset_table.c.identifier[k]
+
+                     # Take the value from the first selector to determine the type.
+                     # TODO: check all selectors to determine the type
+                     v = first_selector[k]
+                     if isinstance(v, int):
+                         column = column.as_integer()
+                     else:
+                         column = column.as_string()
                  else:
-                     column = column.as_string()
-             else:
-                 column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
+                     column = func.json_extract(dataset_table.c.identifier, f"$.{k}")

-             join_conditions.append(attribute_cte.c[k] == column)
+                 join_conditions.append(attribute_cte.c[k] == column)

-         query = query.select_from(
-             dataset_table.join(attribute_cte, and_(*join_conditions))
-         )
+             query = query.select_from(
+                 dataset_table.join(attribute_cte, and_(*join_conditions))
+             )

          if where:
              query = query.filter(text(where))
@@ -304,6 +313,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
              query = query.filter(dataset_table.c.dataset_type == dataset_type)
          if provider:
              query = query.filter(dataset_table.c.provider == provider)
+         if dataset_state:
+             query = query.filter(dataset_table.c.state.in_(dataset_state))

          return query

@@ -393,6 +404,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          dataset_id: Optional[Union[str, List[str]]] = None,
          selector: Optional[Union[Selector, List[Selector]]] = None,
          metadata_only: bool = False,
+         page: Optional[int] = None,
+         page_size: Optional[int] = None,
+         dataset_state: Optional[List[DatasetState]] = None,
      ) -> DatasetCollection:
          def apply_query_filter(query):
              return self._filter_query(
@@ -402,15 +416,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                  provider=provider,
                  dataset_id=dataset_id,
                  selector=selector,
+                 dataset_state=dataset_state,
              )

          with self.session:
              # Use a contextmanager to make sure it's closed afterwards

              if not metadata_only:
+                 # Apply sorting by created_at in ascending order
                  dataset_query = apply_query_filter(
                      self.session.query(dataset_table.c.dataset_id)
-                 )
+                 ).order_by(dataset_table.c.created_at.asc())
+
+                 # Apply pagination if both page and page_size are provided
+                 if page is not None and page_size is not None:
+                     offset = (page - 1) * page_size
+                     dataset_query = dataset_query.offset(offset).limit(page_size)
+
                  self._debug_query(dataset_query)
                  dataset_ids = [row.dataset_id for row in dataset_query]
                  datasets = self._load_datasets(dataset_ids)
@@ -501,19 +523,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          with self.connect() as connection:
              try:
                  # Delete modified files related to the dataset
-                 file_table.delete().where(
-                     file_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     file_table.delete().where(
+                         file_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  # Delete revisions related to the dataset
-                 revision_table.delete().where(
-                     revision_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     revision_table.delete().where(
+                         revision_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  # Delete the dataset itself
-                 dataset_table.delete().where(
-                     dataset_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     dataset_table.delete().where(
+                         dataset_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  connection.commit()
              except Exception:
@@ -604,3 +632,44 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                  )
              )
          return ingestion_job_summaries
+
+     def get_store_version(self) -> Optional[str]:
+         """Get the current Ingestify version stored for this store."""
+         with self.session:
+             row = self.session.query(store_version_table.c.ingestify_version).first()
+             return row.ingestify_version if row else None
+
+     def set_store_version(self, version: str):
+         """Set the Ingestify version for this store."""
+         from ingestify.utils import utcnow
+
+         now = utcnow()
+         entity = {
+             "id": 1,
+             "ingestify_version": version,
+             "created_at": now,
+             "updated_at": now,
+         }
+
+         with self.connect() as connection:
+             try:
+                 self._upsert(connection, store_version_table, [entity])
+                 connection.commit()
+             except Exception:
+                 connection.rollback()
+                 raise
+
+     def ensure_compatible_version(self, current_version: str):
+         """Ensure the store is compatible with the current Ingestify version."""
+         stored_version = self.get_store_version()
+
+         if stored_version is None:
+             # First time setup - store the current version
+             self.set_store_version(current_version)
+             logger.info(f"Initialized store with Ingestify version {current_version}")
+         elif stored_version != current_version:
+             # Version mismatch - for now just log, future: trigger migration
+             logger.warning(
+                 f"Store version mismatch: stored={stored_version}, current={current_version}. "
+                 f"Future versions may require migration."
+             )
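
The new page / page_size arguments are translated into an OFFSET/LIMIT pair on the dataset query (ordered by created_at), and dataset_state becomes a plain IN filter. The pagination arithmetic, sketched on a list instead of a SQLAlchemy query for illustration:

# Illustrative only: the same (page - 1) * page_size arithmetic as above,
# applied to a plain list instead of a SQLAlchemy query.
def paginate(rows: list, page: int, page_size: int) -> list:
    offset = (page - 1) * page_size          # 1-based page number -> row offset
    return rows[offset:offset + page_size]   # at most page_size rows


assert paginate(list(range(10)), page=1, page_size=4) == [0, 1, 2, 3]
assert paginate(list(range(10)), page=3, page_size=4) == [8, 9]
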
ingestify/infra/store/dataset/sqlalchemy/tables.py CHANGED
@@ -84,6 +84,7 @@ class PathString(TypeDecorator):


  class DatasetStateString(TypeDecorator):
+     cache_ok = True
      impl = String(255)

      def process_bind_param(self, value: DatasetState, dialect):
@@ -318,6 +319,15 @@ task_summary_table = Table(
      # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
      # Column("source", JSONType()),
  )
+
+ store_version_table = Table(
+     "store_version",
+     metadata,
+     Column("id", Integer, primary_key=True, default=1),
+     Column("ingestify_version", String(255), nullable=False),
+     Column("created_at", TZDateTime(6), nullable=False),
+     Column("updated_at", TZDateTime(6), nullable=False),
+ )
  #
  #
  # mapper_registry = registry()
ingestify/main.py CHANGED
@@ -138,12 +138,16 @@ def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:

  def get_source_cls(key: str) -> Type[Source]:
      if key.startswith("ingestify."):
-         _, type_ = key.split(".")
+         _, type_ = key.split(".", maxsplit=1)
          if type_ == "wyscout":
              from ingestify.infra.source.wyscout import Wyscout

              return Wyscout

+         elif type_ == "statsbomb.match":
+             from ingestify.infra.source.statsbomb.match import StatsBombMatchAPI
+
+             return StatsBombMatchAPI
          elif type_ == "statsbomb_github":
              from ingestify.infra.source.statsbomb_github import StatsbombGithub

@@ -183,15 +187,36 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:


  def get_engine(
-     config_file, bucket: Optional[str] = None, disable_events: bool = False
+     config_file: Optional[str] = None,
+     bucket: Optional[str] = None,
+     disable_events: bool = False,
+     metadata_url: Optional[str] = None,
+     file_url: Optional[str] = None,
  ) -> IngestionEngine:
-     config = parse_config(config_file, default_value="")
-
-     logger.info("Initializing sources")
      sources = {}
-     sys.path.append(os.path.dirname(config_file))
-     for name, source_args in config["sources"].items():
-         sources[name] = build_source(name=name, source_args=source_args)
+
+     if not config_file:
+         if not metadata_url or not file_url:
+             raise ValueError(
+                 f"You must specify metadata_url and file_url in case you don't use a config_file"
+             )
+
+         config = {
+             "main": {
+                 "metadata_url": metadata_url,
+                 "file_url": file_url,
+                 "default_bucket": bucket or "main",
+             }
+         }
+     elif not config_file:
+         raise ValueError("You must specify a config file")
+     else:
+         config = parse_config(config_file, default_value="")
+
+     logger.info("Initializing sources")
+     sys.path.append(os.path.dirname(config_file))
+     for name, source_args in config.get("sources", {}).items():
+         sources[name] = build_source(name=name, source_args=source_args)

      logger.info("Initializing IngestionEngine")
      store = get_dataset_store_by_urls(
@@ -244,13 +269,13 @@
          # but makes it easier later one where we loop over selectors.
          selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]

-         ingestion_plan = IngestionPlan(
+         ingestion_plan_ = IngestionPlan(
              source=sources[ingestion_plan["source"]],
              dataset_type=ingestion_plan["dataset_type"],
              selectors=selectors,
              fetch_policy=fetch_policy,
              data_spec_versions=data_spec_versions,
          )
-         ingestion_engine.add_ingestion_plan(ingestion_plan)
+         ingestion_engine.add_ingestion_plan(ingestion_plan_)

      return ingestion_engine
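
The maxsplit=1 change in get_source_cls is what makes the dotted ingestify.statsbomb.match key above resolvable; the old two-value unpacking would have raised. A quick illustration:

key = "ingestify.statsbomb.match"

# Old behaviour: `_, type_ = key.split(".")` raises
# ValueError: too many values to unpack (expected 2)
_, type_ = key.split(".", maxsplit=1)  # new behaviour
assert type_ == "statsbomb.match"
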
ingestify/utils.py CHANGED
@@ -5,13 +5,11 @@ import re
  import traceback
  from concurrent.futures import ThreadPoolExecutor
  from contextlib import contextmanager
- from multiprocessing import get_context, cpu_count, get_all_start_methods

  from datetime import datetime, timezone
  from string import Template
  from typing import Dict, Tuple, Optional, Any, List

- import cloudpickle
  from pydantic import Field
  from typing_extensions import Self

@@ -75,8 +73,8 @@ class AttributeBag:
          return Template(string).substitute(**self.attributes)

      def matches(self, attributes: Dict) -> bool:
-         for k, v in self.attributes.items():
-             if attributes.get(k) != v:
+         for k, v in attributes.items():
+             if k in self.attributes and self.attributes[k] != v:
                  return False
          return True

@@ -110,34 +108,6 @@
          )


- def cloud_unpack_and_call(args):
-     f_pickled, org_args = args
-
-     f = cloudpickle.loads(f_pickled)
-     return f(org_args)
-
-
- def map_in_pool(func, iterable, processes=0):
-     # TODO: move to cmdline
-     if os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-         return list(map(func, iterable))
-
-     if not processes:
-         processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
-
-     if "fork" in get_all_start_methods():
-         ctx = get_context("fork")
-     else:
-         ctx = get_context("spawn")
-
-     wrapped_fn = cloudpickle.dumps(func)
-
-     with ctx.Pool(processes or cpu_count()) as pool:
-         return pool.map(
-             cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-         )
-
-
  class SyncExecutor:
      def map(self, func, iterable):
          return [func(item) for item in iterable]
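
The AttributeBag.matches change flips which side drives the comparison: previously every attribute in the bag had to be present in the passed attributes with an equal value; now only keys present on both sides are compared. Standalone re-implementations of both variants for illustration (not the package's class):

# Standalone re-implementations for illustration; not imported from ingestify.
def matches_old(bag: dict, attributes: dict) -> bool:
    return all(attributes.get(k) == v for k, v in bag.items())


def matches_new(bag: dict, attributes: dict) -> bool:
    return all(bag[k] == v for k, v in attributes.items() if k in bag)


bag = {"competition_id": 11, "season_id": 90}
assert matches_old(bag, {"competition_id": 11}) is False  # season_id missing from the query -> no match
assert matches_new(bag, {"competition_id": 11}) is True   # only overlapping keys are compared
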