ingestify 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/loader.py +8 -4
- ingestify/cmdline.py +10 -1
- ingestify/domain/models/ingestion/ingestion_job.py +19 -13
- ingestify/domain/models/ingestion/ingestion_job_summary.py +23 -16
- ingestify/infra/store/dataset/sqlalchemy/repository.py +76 -37
- ingestify/infra/store/dataset/sqlalchemy/tables.py +2 -1
- ingestify/infra/store/file/s3_file_repository.py +6 -1
- ingestify/main.py +13 -6
- ingestify/utils.py +38 -28
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/METADATA +1 -1
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/RECORD +15 -15
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/WHEEL +0 -0
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED
ingestify/application/loader.py
CHANGED
@@ -35,11 +35,8 @@ class Loader:
         provider: Optional[str] = None,
         source: Optional[str] = None,
     ):
-
-        selectors = {}
+        ingestion_plans = []
         for ingestion_plan in self.ingestion_plans:
-            logger.info(f"Determining selectors for {ingestion_plan}")
-
             if provider is not None:
                 if ingestion_plan.source.provider != provider:
                     logger.info(
@@ -54,6 +51,13 @@
                     )
                     continue
 
+            ingestion_plans.append(ingestion_plan)
+
+        # First collect all selectors, before discovering datasets
+        selectors = {}
+        for ingestion_plan in ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
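The refactor splits one loop into two passes: plans are filtered first, and selector collection only runs over the surviving plans, so every selector is known before any dataset discovery starts. A runnable sketch of that shape, using illustrative dicts rather than ingestify's plan objects:

all_plans = [
    {"provider": "statsbomb", "selectors": ["competition_id=11"]},
    {"provider": "wyscout", "selectors": ["competition_id=524"]},
]
provider = "statsbomb"  # e.g. from the --provider CLI option

# Pass 1: keep only matching plans
ingestion_plans = [
    plan for plan in all_plans if provider is None or plan["provider"] == provider
]

# Pass 2: collect selectors from the filtered plans
selectors = {}
for plan in ingestion_plans:
    for selector in plan["selectors"]:
        selectors.setdefault(selector, []).append(plan["provider"])

print(selectors)  # {'competition_id=11': ['statsbomb']}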
ingestify/cmdline.py
CHANGED
@@ -88,6 +88,14 @@ def cli():
     help="Source - only run tasks for a single source",
     type=str,
 )
+@click.option(
+    "--disable-events",
+    "disable_events",
+    required=False,
+    help="Disable events - disable all event handlers",
+    is_flag=True,
+    type=bool,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
@@ -95,9 +103,10 @@ def run(
     provider: Optional[str],
     source: Optional[str],
     debug: Optional[bool],
+    disable_events: Optional[bool],
 ):
     try:
-        engine = get_engine(config_file, bucket)
+        engine = get_engine(config_file, bucket, disable_events=disable_events)
     except ConfigurationError as e:
         if debug:
             raise
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -218,7 +218,7 @@ class IngestionJob:
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
         logger.info("Finding metadata")
-        with ingestion_job_summary.record_timing("
+        with ingestion_job_summary.record_timing("get_dataset_collection_metadata"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
                 provider=self.ingestion_plan.source.provider,
@@ -232,6 +232,7 @@ class IngestionJob:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
         try:
+            logger.info(f"Finding datasets for selector={self.selector}")
             with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
@@ -249,6 +250,8 @@
             yield ingestion_job_summary
             return
 
+        logger.info("Starting tasks")
+
         finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
         while True:
@@ -273,15 +276,18 @@
                 for dataset_resource in batch
             ]
 
-
-
-
-            #
-
-
-
+            logger.info(f"Searching for existing Datasets for DatasetResources")
+
+            with ingestion_job_summary.record_timing("get_dataset_collection"):
+                # Load all available datasets based on the discovered dataset identifiers
+                dataset_collection = store.get_dataset_collection(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    # Assume all DatasetResources share the same provider
+                    provider=batch[0].provider,
+                    selector=dataset_identifiers,
+                )
 
-
+            skipped_tasks = 0
 
             task_set = TaskSet()
             for dataset_resource in batch:
@@ -301,7 +307,7 @@
                         )
                     )
                 else:
-
+                    skipped_tasks += 1
             else:
                 if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
                     task_set.add(
@@ -311,12 +317,12 @@
                         )
                     )
                 else:
-
+                    skipped_tasks += 1
 
         if task_set:
             logger.info(
                 f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                f"using selector {self.selector} => {len(task_set)} tasks. {
+                f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
             )
             logger.info(f"Running {len(task_set)} tasks")
             ingestion_job_summary.add_task_summaries(
@@ -328,7 +334,7 @@
                 f"using selector {self.selector} => nothing to do"
             )
 
-        ingestion_job_summary.
+        ingestion_job_summary.increase_skipped_tasks(skipped_tasks)
 
         if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
             finish_task_timer()
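Two behavioral notes fall out of this section: existing datasets are now looked up once per batch (a single `get_dataset_collection` call keyed by every identifier in the batch, assuming one provider per batch), and resources the fetch policy rejects are counted in `skipped_tasks` instead of being silently dropped. A generic sketch of the per-batch lookup, with illustrative shapes rather than ingestify's resource objects:

batch = [
    {"provider": "statsbomb", "match_id": "3788741"},
    {"provider": "statsbomb", "match_id": "3788742"},
]

dataset_identifiers = [{"match_id": r["match_id"]} for r in batch]
provider = batch[0]["provider"]  # same single-provider assumption the diff comments on

# One lookup per batch instead of one per resource:
print(f"get_dataset_collection(provider={provider!r}, selector={dataset_identifiers})")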
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -41,7 +41,8 @@ class IngestionJobSummary(BaseModel, HasTiming):
     state: IngestionJobState = IngestionJobState.RUNNING
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
-
+    total_tasks: int = 0
+    skipped_tasks: int = 0
     failed_tasks: int = 0
     successful_tasks: int = 0
     ignored_successful_tasks: int = 0
@@ -62,11 +63,11 @@ class IngestionJobSummary(BaseModel, HasTiming):
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)
 
-    def
-        self.
+    def increase_skipped_tasks(self, skipped_tasks: int):
+        self.skipped_tasks += skipped_tasks
 
     def task_count(self):
-        return len(self.task_summaries)
+        return len(self.task_summaries) + self.skipped_tasks
 
     def _set_ended(self):
         self.failed_tasks = len(
@@ -82,6 +83,12 @@ class IngestionJobSummary(BaseModel, HasTiming):
                 if task.state == TaskState.FINISHED_IGNORED
             ]
         )
+        self.total_tasks = (
+            self.failed_tasks
+            + self.successful_tasks
+            + self.ignored_successful_tasks
+            + self.skipped_tasks
+        )
         self.ended_at = utcnow()
 
         # Only keep failed tasks. Rest isn't interesting
@@ -106,22 +113,22 @@ class IngestionJobSummary(BaseModel, HasTiming):
             f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
         )
         print("********************************")
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f" - IngestionPlan:")
+        print(f"     Source: {self.source_name}")
+        print(f"     Provider: {self.provider}")
+        print(f"     DatasetType: {self.dataset_type}")
+        print(f" - Selector: {self.selector}")
+        print(f" - Timings: ")
         for timing in self.timings:
-            print(f"
+            print(f"     - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f"
+            f" - Tasks: {self.total_tasks} - {(self.total_tasks / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f" - Failed tasks: {self.failed_tasks}")
+        print(f" - Successful tasks: {self.successful_tasks}")
+        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f" - Skipped datasets: {self.skipped_tasks}")
         print("********************************")
 
     def __enter__(self):
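`task_count()` now includes skipped tasks, so batches full of already-up-to-date datasets still advance toward `MAX_TASKS_PER_CHUNK`, and `total_tasks` is derived from the four counters when the job ends. A small arithmetic illustration with made-up values:

failed_tasks, successful_tasks, ignored_successful_tasks, skipped_tasks = 2, 40, 5, 13

# Mirrors _set_ended() in the diff above
total_tasks = failed_tasks + successful_tasks + ignored_successful_tasks + skipped_tasks
assert total_tasks == 60

# task_count() during the run: finished task summaries plus skipped tasks
task_summaries = 47  # stands in for len(self.task_summaries), made up
print(task_summaries + skipped_tasks)  # 60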
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -1,4 +1,5 @@
 import itertools
+import logging
 import uuid
 from typing import Optional, Union, List
 
@@ -14,10 +15,11 @@ from sqlalchemy import (
     and_,
     Column,
     or_,
+    Dialect,
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session
+from sqlalchemy.orm import Session, Query, sessionmaker, scoped_session
 
 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
@@ -32,6 +34,7 @@ from ingestify.domain.models.dataset.collection_metadata import (
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
 from ingestify.domain.models.task.task_summary import TaskSummary
 from ingestify.exceptions import IngestifyError
+from ingestify.utils import get_concurrency
 
 from .tables import (
     metadata,
@@ -42,6 +45,8 @@ from .tables import (
     task_summary_table,
 )
 
+logger = logging.getLogger(__name__)
+
 
 def parse_value(v):
     try:
@@ -92,16 +97,15 @@ class SqlAlchemySessionProvider:
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
+            pool_size=get_concurrency(),  # Maximum number of connections in the pool
+            max_overflow=5,
+            pool_recycle=1800,
+            pool_pre_ping=True,
         )
-        self.
-
-    def __init__(self, url: str):
-        url = self.fix_url(url)
-
-        self.url = url
-        self._init_engine()
+        self.dialect = self.engine.dialect
 
-
+        session_factory = sessionmaker(bind=self.engine)
+        self.session = scoped_session(session_factory)
 
     def __getstate__(self):
         return {"url": self.url}
@@ -110,20 +114,27 @@ class SqlAlchemySessionProvider:
         self.url = state["url"]
         self._init_engine()
 
-    def
-
-
-
+    def __init__(self, url: str):
+        url = self.fix_url(url)
+
+        self.url = url
+        self._init_engine()
+
+        metadata.create_all(self.engine)
 
     def __del__(self):
-        self.
+        self.close()
 
     def reset(self):
-        self.
+        self.close()
         self._init_engine()
 
+    def close(self):
+        if hasattr(self, "engine"):
+            self.engine.dispose()
+
     def get(self):
-        return self.session
+        return self.session()
 
 
 def in_(column: Column, values):
@@ -138,8 +149,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def session(self):
         return self.session_provider.get()
 
+    @property
+    def dialect(self) -> Dialect:
+        return self.session_provider.dialect
+
     def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
-        dialect = self.
+        dialect = self.dialect.name
         if dialect == "mysql":
             from sqlalchemy.dialects.mysql import insert
         elif dialect == "postgresql":
@@ -183,7 +198,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         else:
             query = query.filter(dataset_table.c.dataset_id == dataset_id)
 
-        dialect = self.
+        dialect = self.dialect.name
 
         if not isinstance(selector, list):
             where, selector = selector.split("where")
@@ -199,9 +214,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not selectors:
             raise ValueError("Selectors must contain at least one item")
 
-        attribute_keys = selectors[
-            0
-        ].filtered_attributes.keys()  # Assume all selectors have the same keys
         attribute_sets = {
             tuple(selector.filtered_attributes.items()) for selector in selectors
         }
@@ -249,7 +261,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         return query
 
-    def
+    def _load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
         if not dataset_ids:
             return []
 
@@ -303,6 +315,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
         return datasets
 
+    def _debug_query(self, q: Query):
+        text_ = q.statement.compile(
+            compile_kwargs={"literal_binds": True}, dialect=self.dialect
+        )
+        logger.debug(f"Running query: {text_}")
+
     def get_dataset_collection(
         self,
         bucket: str,
@@ -322,22 +340,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             selector=selector,
         )
 
-
-
-            self.session.query(dataset_table.c.dataset_id)
-        )
-        dataset_ids = [row.dataset_id for row in dataset_query]
-        datasets = self.load_datasets(dataset_ids)
-        else:
-            datasets = []
+        with self.session:
+            # Use a contextmanager to make sure it's closed afterwards
 
-
-
-
-
-
-
-
+            if not metadata_only:
+                dataset_query = apply_query_filter(
+                    self.session.query(dataset_table.c.dataset_id)
+                )
+                self._debug_query(dataset_query)
+                dataset_ids = [row.dataset_id for row in dataset_query]
+                datasets = self._load_datasets(dataset_ids)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    last_modified=max(dataset.last_modified_at for dataset in datasets)
+                    if datasets
+                    else None,
+                    row_count=len(datasets),
+                )
+            else:
+                datasets = []
+
+                metadata_result_query = apply_query_filter(
+                    self.session.query(
+                        func.max(dataset_table.c.last_modified_at).label(
+                            "last_modified_at"
+                        ),
+                        func.count().label("row_count"),
+                    )
+                )
+
+                self._debug_query(metadata_result_query)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    *metadata_result_query.first()
+                )
 
         return DatasetCollection(dataset_collection_metadata, datasets)
 
@@ -350,6 +386,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def connect(self):
         return self.session_provider.engine.connect()
 
+    def __del__(self):
+        self.session_provider.close()
+
     def _save(self, datasets: list[Dataset]):
         """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
         datasets_entities = []
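The provider now builds its sessions with `sessionmaker` wrapped in `scoped_session`, which matters once tasks run on a thread pool (see the utils.py changes below): each thread gets its own Session from the registry. A minimal sketch of that pattern, outside ingestify's classes:

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine("sqlite:///:memory:", pool_pre_ping=True)
session_factory = sessionmaker(bind=engine)
session = scoped_session(session_factory)

# Calling the registry returns the Session bound to the current thread,
# which is why get() changed from returning self.session to self.session().
s1 = session()
s2 = session()
assert s1 is s2   # same thread -> same Session
session.remove()  # drop the thread-local Session, releasing its connection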
ingestify/infra/store/dataset/sqlalchemy/tables.py
CHANGED

@@ -247,9 +247,10 @@ ingestion_job_summary_table = Table(
     Column("ended_at", TZDateTime(6)),
     # Some task counters
     Column("state", IngestionJobStateString),
+    Column("total_tasks", Integer),
     Column("successful_tasks", Integer),
     Column("ignored_successful_tasks", Integer),
-    Column("
+    Column("skipped_tasks", Integer),
     Column("failed_tasks", Integer),
     Column(
         "timings",
ingestify/infra/store/file/s3_file_repository.py
CHANGED

@@ -2,9 +2,11 @@ from pathlib import Path
 from typing import BinaryIO
 
 import boto3 as boto3
+import botocore.config
 
 from ingestify.domain import Dataset
 from ingestify.domain.models import FileRepository
+from ingestify.utils import get_concurrency
 
 
 class S3FileRepository(FileRepository):
@@ -13,7 +15,10 @@ class S3FileRepository(FileRepository):
     @property
     def s3(self):
         if not self._s3:
-
+            client_config = botocore.config.Config(
+                max_pool_connections=get_concurrency(),
+            )
+            self._s3 = boto3.resource("s3", config=client_config)
         return self._s3
 
     def __getstate__(self):
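Sizing the botocore connection pool to the shared concurrency value avoids urllib3's "Connection pool is full, discarding connection" warnings when the thread pool runs more workers than botocore's default of 10 pooled connections. A standalone sketch (pool size hardcoded for illustration):

import boto3
import botocore.config

# max_pool_connections defaults to 10; match it to the worker count instead.
client_config = botocore.config.Config(max_pool_connections=32)
s3 = boto3.resource("s3", config=client_config)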
ingestify/main.py
CHANGED
@@ -182,7 +182,9 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
     return import_cls(key)
 
 
-def get_engine(
+def get_engine(
+    config_file, bucket: Optional[str] = None, disable_events: bool = False
+) -> IngestionEngine:
     config = parse_config(config_file, default_value="")
 
     logger.info("Initializing sources")
@@ -201,11 +203,16 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     # Setup an EventBus and wire some more components
     event_bus = EventBus()
-
-
-
-
-
+    if not disable_events:
+        # When we disable all events we don't register any publishers
+        publisher = Publisher()
+        for subscriber in config.get("event_subscribers", []):
+            cls = get_event_subscriber_cls(subscriber["type"])
+            publisher.add_subscriber(cls(store))
+        event_bus.register(publisher)
+    else:
+        logger.info("Disabling all event handlers")
+
     store.set_event_bus(event_bus)
 
     ingestion_engine = IngestionEngine(
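Because `disable_events` defaults to False, existing callers of `get_engine` keep their behavior. A minimal usage sketch (the config path is illustrative):

from ingestify.main import get_engine

# No Publisher is registered; the EventBus stays empty.
engine = get_engine("config.yaml", disable_events=True)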
ingestify/utils.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import time
 import re
 import traceback
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods
 
@@ -137,59 +138,65 @@ def map_in_pool(func, iterable, processes=0):
     )
 
 
-class
+class SyncExecutor:
     def map(self, func, iterable):
         return [func(item) for item in iterable]
 
-    def
-        return
+    def __enter__(self):
+        return self
 
-    def
-
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
 
 
-class
+class DummyExecutor:
     def map(self, func, iterable):
         logger.info(f"DummyPool: not running {len(list(iterable))} tasks")
         return None
 
-    def
-        return
+    def __enter__(self):
+        return self
 
-    def
-
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
 
 
 class TaskExecutor:
     def __init__(self, processes=0, dry_run: bool = False):
         if dry_run:
-
+            executor = DummyExecutor()
         elif os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-
+            executor = SyncExecutor()
        else:
             if not processes:
-                processes =
+                processes = get_concurrency()
+
+            # if "fork" in get_all_start_methods():
+            #     ctx = get_context("fork")
+            # else:
+            #     ctx = get_context("spawn")
 
-
-            ctx = get_context("fork")
-        else:
-            ctx = get_context("spawn")
+            # pool = ctx.Pool(processes or cpu_count())
 
-
-
+            executor = ThreadPoolExecutor(max_workers=processes)
+
+        self.executor = executor
 
     def __enter__(self):
+        self.executor.__enter__()
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.
+        self.executor.__exit__(exc_type, exc_val, exc_tb)
 
     def run(self, func, iterable):
-
+        # If multiprocessing
+        # wrapped_fn = cloudpickle.dumps(func)
+        # res = self.executor.map(
+        #     cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
+        # )
         start_time = time.time()
-        res = self.
-            cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-        )
+        res = list(self.executor.map(func, iterable))
         if res:
             took = time.time() - start_time
             logger.info(
@@ -197,10 +204,6 @@ class TaskExecutor:
         )
         return res
 
-    def join(self):
-        self.pool.close()
-        self.pool.join()
-
 
 def try_number(s: str):
     try:
@@ -253,3 +256,10 @@ class HasTiming:
         self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
 
         return finish
+
+
+def get_concurrency():
+    concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
+    if not concurrency:
+        concurrency = min(32, (os.cpu_count() or 1) + 4)
+    return concurrency
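`get_concurrency()` now centralizes the worker count used by the task executor, the SQLAlchemy pool size, and the S3 connection pool. A self-contained illustration of its resolution order (the function body is copied from the diff above; the env value is made up):

import os

os.environ["INGESTIFY_CONCURRENCY"] = "16"  # explicit override, illustrative value

def get_concurrency():
    concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
    if not concurrency:
        # Same default formula ThreadPoolExecutor uses: min(32, cpu_count + 4)
        concurrency = min(32, (os.cpu_count() or 1) + 4)
    return concurrency

print(get_concurrency())  # -> 16 from the override; otherwise min(32, cpu_count + 4)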
{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
-ingestify/cmdline.py,sha256=
+ingestify/__init__.py,sha256=6SmxhtKjGRDG31Ij8xc2i9L-7qC3qjA5DE89jQoD48Q,301
+ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=EMdG3ZP3bX9DHxHvBLdkYLC3vcEVym7dmpIXQTikI3I,7281
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
 ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
-ingestify/application/loader.py,sha256=
+ingestify/application/loader.py,sha256=Lg3qPLaeKOFGheeqqfVeCBEF3cn61oZThgYYHoqfOvQ,7694
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=-SxHunvtG8J2u8LwXacF26oItwMkLJN7Suelt-hjHgk,13434
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=ZEoL8kZfDM_NUYXD4_7Xpmtz6WduN50UcJBgNOxOxrE,4669
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -64,12 +64,12 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=mIF7ly-lyCSNJQeem2Dpxlllzn34MxEA97qV929ARDY,17361
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
-ingestify/infra/store/file/s3_file_repository.py,sha256=
+ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.5.0.dist-info/METADATA,sha256=EsJsolUWxelVsEOhLUyiut_tKPYfqHx9Pvvg_T-HFG4,18854
+ingestify-0.5.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.5.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.5.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.5.0.dist-info/RECORD,,
{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/WHEEL
File without changes

{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/top_level.txt
File without changes