ingestify 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/domain/models/ingestion/ingestion_job.py +8 -6
- ingestify/domain/models/ingestion/ingestion_job_summary.py +13 -6
- ingestify/infra/store/dataset/sqlalchemy/repository.py +55 -21
- ingestify/infra/store/dataset/sqlalchemy/tables.py +2 -1
- ingestify/infra/store/file/s3_file_repository.py +6 -1
- ingestify/utils.py +38 -28
- {ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/METADATA +1 -1
- {ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/RECORD +12 -12
- {ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/WHEEL +0 -0
- {ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/entry_points.txt +0 -0
- {ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/top_level.txt +0 -0
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED
@@ -24,7 +24,9 @@ from ingestify.utils import TaskExecutor, chunker
 logger = logging.getLogger(__name__)


-DEFAULT_CHUNK_SIZE = 1_000
+# Decrease batch size from 1_000 to 500. The sqlalchemy repository uses
+# a compound select, which breaks at more than 500 select statements
+DEFAULT_CHUNK_SIZE = 500


 def run_task(task):
@@ -287,7 +289,7 @@ class IngestionJob:
             selector=dataset_identifiers,
         )

-
+        skipped_tasks = 0

         task_set = TaskSet()
         for dataset_resource in batch:
@@ -307,7 +309,7 @@ class IngestionJob:
                         )
                     )
                 else:
-
+                    skipped_tasks += 1
             else:
                 if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
                     task_set.add(
@@ -317,12 +319,12 @@ class IngestionJob:
                         )
                     )
                 else:
-
+                    skipped_tasks += 1

         if task_set:
             logger.info(
                 f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                f"using selector {self.selector} => {len(task_set)} tasks. {
+                f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
             )
             logger.info(f"Running {len(task_set)} tasks")
             ingestion_job_summary.add_task_summaries(
@@ -334,7 +336,7 @@ class IngestionJob:
                 f"using selector {self.selector} => nothing to do"
             )

-        ingestion_job_summary.
+        ingestion_job_summary.increase_skipped_tasks(skipped_tasks)

         if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
             finish_task_timer()
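The new comment pins down why the batch size dropped: every id in a batch becomes one member of a compound SELECT in the SQLAlchemy repository (see the repository.py changes below), and SQLite, for one, caps compound selects at 500 terms by default (SQLITE_MAX_COMPOUND_SELECT). A minimal sketch of the batching behavior, assuming `chunker` yields fixed-size lists; the generator below is an illustrative stand-in, not the actual `ingestify.utils.chunker`:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunker(iterable: Iterable[T], chunk_size: int = 500) -> Iterator[List[T]]:
    # Hypothetical stand-in for ingestify.utils.chunker: yield fixed-size
    # batches so no single batch exceeds the compound-select limit.
    iterator = iter(iterable)
    while chunk := list(islice(iterator, chunk_size)):
        yield chunk

for batch in chunker(range(1_200)):
    print(len(batch))  # 500, 500, 200
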
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -41,7 +41,8 @@ class IngestionJobSummary(BaseModel, HasTiming):
     state: IngestionJobState = IngestionJobState.RUNNING
     task_summaries: List[TaskSummary] = Field(default_factory=list)

-
+    total_tasks: int = 0
+    skipped_tasks: int = 0
     failed_tasks: int = 0
     successful_tasks: int = 0
     ignored_successful_tasks: int = 0
@@ -62,11 +63,11 @@ class IngestionJobSummary(BaseModel, HasTiming):
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)

-    def
-        self.
+    def increase_skipped_tasks(self, skipped_tasks: int):
+        self.skipped_tasks += skipped_tasks

     def task_count(self):
-        return len(self.task_summaries) + self.
+        return len(self.task_summaries) + self.skipped_tasks

     def _set_ended(self):
         self.failed_tasks = len(
@@ -82,6 +83,12 @@ class IngestionJobSummary(BaseModel, HasTiming):
                 if task.state == TaskState.FINISHED_IGNORED
             ]
         )
+        self.total_tasks = (
+            self.failed_tasks
+            + self.successful_tasks
+            + self.ignored_successful_tasks
+            + self.skipped_tasks
+        )
         self.ended_at = utcnow()

         # Only keep failed tasks. Rest isn't interesting
@@ -115,13 +122,13 @@ class IngestionJobSummary(BaseModel, HasTiming):
         for timing in self.timings:
             print(f" - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f" - Tasks: {
+            f" - Tasks: {self.total_tasks} - {(self.total_tasks / self.duration.total_seconds()):.1f} tasks/sec"
         )

         print(f" - Failed tasks: {self.failed_tasks}")
         print(f" - Successful tasks: {self.successful_tasks}")
         print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
-        print(f" - Skipped datasets: {self.
+        print(f" - Skipped datasets: {self.skipped_tasks}")
         print("********************************")

     def __enter__(self):
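Skipped tasks never run, so they produce no `TaskSummary`; the diff therefore carries them in a separate counter and folds them back in for `task_count()` and `total_tasks`. A stripped-down sketch of that bookkeeping (field names taken from the diff; the real class is a pydantic model with many more fields):

from dataclasses import dataclass, field
from typing import List

@dataclass
class SummarySketch:
    task_summaries: List[str] = field(default_factory=list)  # stand-in for TaskSummary
    skipped_tasks: int = 0

    def increase_skipped_tasks(self, skipped_tasks: int) -> None:
        self.skipped_tasks += skipped_tasks

    def task_count(self) -> int:
        # Executed tasks have summaries; skipped ones exist only as a count.
        return len(self.task_summaries) + self.skipped_tasks

summary = SummarySketch(task_summaries=["t1", "t2"])
summary.increase_skipped_tasks(3)
assert summary.task_count() == 5
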
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -19,7 +19,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session, Query
+from sqlalchemy.orm import Session, Query, sessionmaker, scoped_session

 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
@@ -34,6 +34,7 @@ from ingestify.domain.models.dataset.collection_metadata import (
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
 from ingestify.domain.models.task.task_summary import TaskSummary
 from ingestify.exceptions import IngestifyError
+from ingestify.utils import get_concurrency

 from .tables import (
     metadata,
@@ -96,17 +97,15 @@ class SqlAlchemySessionProvider:
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
+            pool_size=get_concurrency(),  # Maximum number of connections in the pool
+            max_overflow=5,
+            pool_recycle=1800,
+            pool_pre_ping=True,
         )
         self.dialect = self.engine.dialect
-        self.session = Session(bind=self.engine)

-
-
-
-        self.url = url
-        self._init_engine()
-
-        metadata.create_all(self.engine)
+        session_factory = sessionmaker(bind=self.engine)
+        self.session = scoped_session(session_factory)

     def __getstate__(self):
         return {"url": self.url}
@@ -115,6 +114,14 @@ class SqlAlchemySessionProvider:
         self.url = state["url"]
         self._init_engine()

+    def __init__(self, url: str):
+        url = self.fix_url(url)
+
+        self.url = url
+        self._init_engine()
+
+        metadata.create_all(self.engine)
+
     def __del__(self):
         self.close()

@@ -123,16 +130,11 @@ class SqlAlchemySessionProvider:
         self._init_engine()

     def close(self):
-        if hasattr(self, "
-            self.session.close()
+        if hasattr(self, "engine"):
             self.engine.dispose()

     def get(self):
-        return self.session
-
-
-    def in_(column: Column, values):
-        return or_(*[column == value for value in values])
+        return self.session()


 class SqlAlchemyDatasetRepository(DatasetRepository):
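Two related changes here: the engine gets an explicitly sized connection pool, and the single shared `Session` becomes a `scoped_session`, which hands each thread its own session; `get()` now calls the registry to resolve the thread-local instance. A condensed sketch of the pattern, assuming callers run on a thread pool (URL and pool numbers are placeholders, not values from the package):

from sqlalchemy import create_engine, text
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.pool import QueuePool

engine = create_engine(
    "sqlite:///example.db",  # placeholder URL
    poolclass=QueuePool,     # explicit, so the pool_* arguments apply
    pool_size=8,             # roughly get_concurrency() in the diff
    max_overflow=5,          # short bursts may open a few extra connections
    pool_recycle=1800,       # retire connections after 30 minutes
    pool_pre_ping=True,      # validate connections before handing them out
)

session_factory = sessionmaker(bind=engine)
session = scoped_session(session_factory)

def worker():
    # Calling the registry returns the session bound to the current thread,
    # creating one on first use.
    thread_session = session()
    thread_session.execute(text("SELECT 1"))
    session.remove()  # release this thread's session when the work is done

The recycle and pre-ping settings matter once connections sit in a pool across a long-running job; stale connections would otherwise surface as mid-job errors.
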
@@ -188,7 +190,19 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                 # return an empty DatasetCollection
                 return DatasetCollection()

-
+            dataset_ids_cte = union_all(
+                *[
+                    select(literal(dataset_id).label("dataset_id"))
+                    for dataset_id in set(dataset_id)
+                ]
+            ).cte("dataset_ids")
+
+            query = query.select_from(
+                dataset_table.join(
+                    dataset_ids_cte,
+                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                )
+            )
         else:
             query = query.filter(dataset_table.c.dataset_id == dataset_id)

@@ -259,15 +273,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not dataset_ids:
             return []

+        dataset_ids_cte = union_all(
+            *[
+                select(literal(dataset_id).label("dataset_id"))
+                for dataset_id in set(dataset_ids)
+            ]
+        ).cte("dataset_ids")
+
         dataset_rows = list(
-            self.session.query(dataset_table).
-
+            self.session.query(dataset_table).select_from(
+                dataset_table.join(
+                    dataset_ids_cte,
+                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                )
             )
         )
         revisions_per_dataset = {}
         rows = (
             self.session.query(revision_table)
-            .
+            .select_from(
+                revision_table.join(
+                    dataset_ids_cte,
+                    dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
+                )
+            )
             .order_by(revision_table.c.dataset_id)
         )

@@ -279,7 +308,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         files_per_revision = {}
         rows = (
             self.session.query(file_table)
-            .
+            .select_from(
+                file_table.join(
+                    dataset_ids_cte,
+                    dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
+                )
+            )
             .order_by(file_table.c.dataset_id, file_table.c.revision_id)
         )

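All three lookups now materialize the id list as a CTE of literal SELECTs joined against the target table, replacing the removed `in_()` helper that OR-ed one comparison per value. Each id contributes one member to the compound select, which is exactly the 500-statement limit the new DEFAULT_CHUNK_SIZE respects. A self-contained sketch of the technique (table and ids are invented for illustration):

from sqlalchemy import (
    Column, MetaData, String, Table, create_engine, literal, select, union_all,
)

metadata = MetaData()
dataset_table = Table(
    "dataset",
    metadata,
    Column("dataset_id", String, primary_key=True),
    Column("name", String),
)

engine = create_engine("sqlite://")
metadata.create_all(engine)

dataset_ids = ["a", "b", "c"]  # invented ids

# One SELECT of a literal per id, UNION ALL-ed together and wrapped in a CTE...
dataset_ids_cte = union_all(
    *[select(literal(d).label("dataset_id")) for d in set(dataset_ids)]
).cte("dataset_ids")

# ...then joined against the real table instead of a large IN (...) list.
query = select(dataset_table).select_from(
    dataset_table.join(
        dataset_ids_cte,
        dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
    )
)

with engine.connect() as conn:
    rows = conn.execute(query).fetchall()

Joining on a values-style CTE keeps the statement bounded and lets the database treat the id list as a real relation, which tends to plan better than a multi-thousand-term IN clause.
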
ingestify/infra/store/dataset/sqlalchemy/tables.py
CHANGED

@@ -247,9 +247,10 @@ ingestion_job_summary_table = Table(
     Column("ended_at", TZDateTime(6)),
     # Some task counters
     Column("state", IngestionJobStateString),
+    Column("total_tasks", Integer),
     Column("successful_tasks", Integer),
     Column("ignored_successful_tasks", Integer),
-    Column("
+    Column("skipped_tasks", Integer),
     Column("failed_tasks", Integer),
     Column(
         "timings",
ingestify/infra/store/file/s3_file_repository.py
CHANGED

@@ -2,9 +2,11 @@ from pathlib import Path
 from typing import BinaryIO

 import boto3 as boto3
+import botocore.config

 from ingestify.domain import Dataset
 from ingestify.domain.models import FileRepository
+from ingestify.utils import get_concurrency


 class S3FileRepository(FileRepository):
@@ -13,7 +15,10 @@ class S3FileRepository(FileRepository):
     @property
     def s3(self):
         if not self._s3:
-
+            client_config = botocore.config.Config(
+                max_pool_connections=get_concurrency(),
+            )
+            self._s3 = boto3.resource("s3", config=client_config)
         return self._s3

     def __getstate__(self):
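botocore defaults to 10 pooled HTTP connections per client, so once the executor runs more threads than that, S3 calls serialize behind the pool. Matching `max_pool_connections` to the worker count removes that bottleneck. A sketch of the configuration in isolation (bucket and key are invented; credentials are assumed to come from the environment):

import boto3
import botocore.config

# Size the HTTP connection pool to the worker count so concurrent
# uploads and downloads don't queue behind botocore's default of 10.
client_config = botocore.config.Config(max_pool_connections=16)
s3 = boto3.resource("s3", config=client_config)

# Invented bucket/key, for illustration only:
# s3.Bucket("my-bucket").upload_file("local.bin", "remote/key.bin")
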
ingestify/utils.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import time
 import re
 import traceback
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods

@@ -137,59 +138,65 @@ def map_in_pool(func, iterable, processes=0):
     )


-class
+class SyncExecutor:
     def map(self, func, iterable):
         return [func(item) for item in iterable]

-    def
-        return
+    def __enter__(self):
+        return self

-    def
-
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass


-class
+class DummyExecutor:
     def map(self, func, iterable):
         logger.info(f"DummyPool: not running {len(list(iterable))} tasks")
         return None

-    def
-        return
+    def __enter__(self):
+        return self

-    def
-
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass


 class TaskExecutor:
     def __init__(self, processes=0, dry_run: bool = False):
         if dry_run:
-
+            executor = DummyExecutor()
         elif os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-
+            executor = SyncExecutor()
         else:
             if not processes:
-                processes =
+                processes = get_concurrency()
+
+            # if "fork" in get_all_start_methods():
+            #     ctx = get_context("fork")
+            # else:
+            #     ctx = get_context("spawn")

-
-                ctx = get_context("fork")
-            else:
-                ctx = get_context("spawn")
+            # pool = ctx.Pool(processes or cpu_count())

-
-
+            executor = ThreadPoolExecutor(max_workers=processes)
+
+        self.executor = executor

     def __enter__(self):
+        self.executor.__enter__()
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.
+        self.executor.__exit__(exc_type, exc_val, exc_tb)

     def run(self, func, iterable):
-
+        # If multiprocessing
+        # wrapped_fn = cloudpickle.dumps(func)
+        # res = self.executor.map(
+        #     cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
+        # )
         start_time = time.time()
-        res = self.
-            cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-        )
+        res = list(self.executor.map(func, iterable))
         if res:
             took = time.time() - start_time
             logger.info(
@@ -197,10 +204,6 @@ class TaskExecutor:
         )
         return res

-    def join(self):
-        self.pool.close()
-        self.pool.join()
-

 def try_number(s: str):
     try:
@@ -253,3 +256,10 @@ class HasTiming:
         self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

         return finish
+
+
+def get_concurrency():
+    concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
+    if not concurrency:
+        concurrency = min(32, (os.cpu_count() or 1) + 4)
+    return concurrency
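The `min(32, (os.cpu_count() or 1) + 4)` fallback mirrors the default worker count `ThreadPoolExecutor` itself uses, and the same `get_concurrency()` value now sizes the database pool and the S3 connection pool, keeping all three in step. A usage sketch of the reworked executor under those assumptions (the task function is invented):

import os

from ingestify.utils import TaskExecutor

def double(item: int) -> int:
    # invented stand-in for a real ingestion task
    return item * 2

os.environ["INGESTIFY_CONCURRENCY"] = "8"  # optional explicit cap

with TaskExecutor() as executor:
    results = executor.run(double, range(10))

print(results)  # [0, 2, 4, ..., 18]
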
{ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/RECORD
CHANGED

@@ -1,10 +1,10 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=Un08YQgoC1u_2dbvOdtOD59OKsUL78ekru-86GA3zpA,301
 ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
 ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=EMdG3ZP3bX9DHxHvBLdkYLC3vcEVym7dmpIXQTikI3I,7281
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
 ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=2Tibe1fKZU84LO_qHniO33ChTcJy3K0YLkVro8CjJPs,13573
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=ZEoL8kZfDM_NUYXD4_7Xpmtz6WduN50UcJBgNOxOxrE,4669
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -64,12 +64,12 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=hn4x5tglqxO4EdyiAt_4wnDXmmWU87twHtBmVBRHVSY,18309
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
-ingestify/infra/store/file/s3_file_repository.py,sha256=
+ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.5.1.dist-info/METADATA,sha256=TOqbUz13KxM8v8kR-owtafHvYEXwj5ruaDP_bigoIbI,18854
+ingestify-0.5.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.5.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.5.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.5.1.dist-info/RECORD,,
{ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/WHEEL
File without changes

{ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/entry_points.txt
File without changes

{ingestify-0.4.2.dist-info → ingestify-0.5.1.dist-info}/top_level.txt
File without changes