PyPI - ingestr - Versions diffs - 0.13.59__py3-none-any.whl → 0.13.60__py3-none-any.whl - Mend

ingestr 0.13.59__py3-none-any.whl → 0.13.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ingestr might be problematic. Click here for more details.

Files changed (10) hide show

ingestr/main.py CHANGED Viewed

@@ -543,6 +543,7 @@ def ingest(
             sql_reflection_level=sql_reflection_level.value,
             sql_limit=sql_limit,
             sql_exclude_columns=sql_exclude_columns,
+            extract_parallelism=extract_parallelism,
         )
         resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))

ingestr/src/buildinfo.py CHANGED Viewed

	@@ -1 +1 @@
1	- version = "v0.13.59"
1	+ version = "v0.13.60"

ingestr/src/sources.py CHANGED Viewed

@@ -737,6 +737,7 @@ class StripeAnalyticsSource:
                         endpoint,
                     ],
                     stripe_secret_key=api_key[0],
+                    max_workers=kwargs.get("extract_parallelism", 4),
                 ).with_resources(endpoint)
         raise ValueError(

ingestr/src/stripe_analytics/__init__.py CHANGED Viewed

@@ -10,7 +10,6 @@ from pendulum import DateTime
 from .helpers import (
     async_parallel_pagination,
     pagination,
-    parallel_pagination,
     transform_date,
 )
@@ -55,53 +54,13 @@ def stripe_source(
         )(endpoint)
-@dlt.source(max_table_nesting=0)
-def parallel_stripe_source(
-    endpoints: Tuple[str, ...],
-    stripe_secret_key: str = dlt.secrets.value,
-    start_date: Optional[DateTime] = None,
-    end_date: Optional[DateTime] = None,
-    max_workers: int = 12,
-) -> Iterable[DltResource]:
-    """
-    Retrieves data from the Stripe API for the specified endpoints using parallel pagination.
-    This source divides the date range across multiple workers to fetch data in parallel,
-    which can significantly speed up data retrieval for large date ranges.
-    Args:
-        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
-        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
-        start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Required for parallel processing.
-        end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Required for parallel processing.
-        max_workers (int): Maximum number of worker threads for parallel fetching. Defaults to 4.
-    Returns:
-        Iterable[DltResource]: Resources with data that was created during the period greater than or equal to 'start_date' and less than 'end_date'.
-    """
-    stripe.api_key = stripe_secret_key
-    stripe.api_version = "2022-11-15"
-    def parallel_stripe_resource(
-        endpoint: str,
-    ) -> Generator[Dict[Any, Any], Any, None]:
-        yield from parallel_pagination(endpoint, start_date, end_date, max_workers)
-    for endpoint in endpoints:
-        yield dlt.resource(
-            parallel_stripe_resource,
-            name=endpoint,
-            write_disposition="replace",
-        )(endpoint)
 @dlt.source(max_table_nesting=0)
 def async_stripe_source(
     endpoints: Tuple[str, ...],
     stripe_secret_key: str = dlt.secrets.value,
     start_date: Optional[DateTime] = None,
     end_date: Optional[DateTime] = None,
-    max_workers: int = 40,
+    max_workers: int = 4,
     rate_limit_delay: float = 0.03,
 ) -> Iterable[DltResource]:
     """

ingestr/src/stripe_analytics/helpers.py CHANGED Viewed

@@ -43,67 +43,6 @@ def pagination(
             break
-def parallel_pagination(
-    endpoint: str,
-    start_date: Optional[Any] = None,
-    end_date: Optional[Any] = None,
-    max_workers: int = 4,
-) -> Iterable[TDataItem]:
-    """
-    Retrieves data from an endpoint with parallel pagination by dividing date ranges across workers.
-    Args:
-        endpoint (str): The endpoint to retrieve data from.
-        start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to 2010-01-01 if None.
-        end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to today if None.
-        max_workers (int): Maximum number of worker threads to use for parallel fetching. Defaults to 4.
-    Returns:
-        Iterable[TDataItem]: Data items retrieved from the endpoint.
-    """
-    # Set default date range if not provided: 2010 to today
-    if not start_date:
-        start_date = pendulum.datetime(2010, 1, 1)
-    if not end_date:
-        end_date = pendulum.now()
-    # Convert dates to timestamps for processing
-    start_ts = transform_date(start_date)
-    end_ts = transform_date(end_date)
-    # If date range is very small, use sequential pagination
-    date_range_days = (end_ts - start_ts) / (24 * 60 * 60)
-    if date_range_days < 30:  # Less than 30 days
-        yield from pagination(endpoint, start_date, end_date)
-        return
-    # Create time chunks with larger chunks for 2010s (less data expected)
-    time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)
-    # Use ThreadPoolExecutor to fetch data in parallel and yield as soon as ready
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Submit all tasks
-        future_to_chunk = {
-            executor.submit(
-                _fetch_chunk_data_streaming, endpoint, chunk_start, chunk_end
-            ): (chunk_start, chunk_end)
-            for chunk_start, chunk_end in time_chunks
-        }
-        # MAXIMUM SPEED - Yield results immediately as they complete
-        for future in as_completed(future_to_chunk):
-            chunk_start, chunk_end = future_to_chunk[future]
-            try:
-                chunk_data = future.result()
-                # Yield all batches from this chunk immediately - NO ORDERING
-                for batch in chunk_data:
-                    yield batch
-            except Exception as exc:
-                print(f"Chunk {chunk_start}-{chunk_end} generated an exception: {exc}")
-                raise exc
 def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
     """
     Divide a time range into equal chunks for parallel processing.
@@ -295,7 +234,6 @@ async def async_parallel_pagination(
     async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
         async with semaphore:
-            await asyncio.sleep(rate_limit_delay)
             return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)
     # Create all tasks
@@ -390,6 +328,10 @@ async def stripe_get_data_async(
     max_wait_time_ms = 10000
     while retry_count < max_retries:
+        # print(
+        #     f"Fetching {resource} from {datetime.fromtimestamp(start_date).strftime('%Y-%m-%d %H:%M:%S') if start_date else 'None'} to {datetime.fromtimestamp(end_date).strftime('%Y-%m-%d %H:%M:%S') if end_date else 'None'}, retry {retry_count} of {max_retries}",
+        #     flush=True,
+        # )
         try:
             resource_dict = await getattr(stripe, resource).list_async(
                 created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
@@ -399,6 +341,10 @@ async def stripe_get_data_async(
             retry_count += 1
             if retry_count < max_retries:
                 wait_time = min(2**retry_count * 0.001, max_wait_time_ms)
+                print(
+                    f"Got rate limited, sleeping {wait_time} seconds before retrying...",
+                    flush=True,
+                )
                 await asyncio.sleep(wait_time)
             else:
                 # Re-raise the last exception if we've exhausted retries

{ingestr-0.13.59.dist-info → ingestr-0.13.60.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.59
+Version: 0.13.60
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues

{ingestr-0.13.59.dist-info → ingestr-0.13.60.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
-ingestr/main.py,sha256=GkC1hdq8AVGrvolc95zMfjmibI95p2pmFkbgCOVf-Og,26311
+ingestr/main.py,sha256=taDyHyaVSpB17iNLl8zA0gmr4CqDO-MSTQX1CaRBB9U,26364
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
-ingestr/src/buildinfo.py,sha256=RkwXqGVjCaeOi85qaAT-2wI5-IYpZt76x8qkp2dVM1o,21
+ingestr/src/buildinfo.py,sha256=1sTup4WLO36DuLnh5cnxtmEDBjKKYxAOSisEvjELy1w,21
 ingestr/src/destinations.py,sha256=TcxM2rcwHfgMMP6U0yRNcfWKlEzkBbZbqCIDww7lkTY,16882
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
 ingestr/src/factory.py,sha256=OKqjYqvHhgaOF48-eSNSabcfXt4Gmr1yZ8cFGizXh0g,6319
@@ -11,7 +11,7 @@ ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,73
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
 ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
 ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
-ingestr/src/sources.py,sha256=C2qPplmvRQdm1nzSPvGbMpYG6oGCbGZMTlVtVS48n6k,98977
+ingestr/src/sources.py,sha256=sJmiiInFb-KCPsaIy4qus6lx59MDCOobWgxJ7lfKH08,99047
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -119,8 +119,8 @@ ingestr/src/solidgate/__init__.py,sha256=JdaXvAu5QGuf9-FY294vwCQCEmfrqIld9oqbzqC
 ingestr/src/solidgate/helpers.py,sha256=oePEc9nnvmN3IaKrfJCvyKL79xdGM0-gRTN3-8tY4Fc,4952
 ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestr/src/sql_database/callbacks.py,sha256=sEFFmXxAURY3yeBjnawigDtq9LBCvi8HFqG4kLd7tMU,2002
-ingestr/src/stripe_analytics/__init__.py,sha256=g2miuPexUcPEEMzmPQZqxEaQ0Q8YjUAkOvKaLn3KC-c,8219
-ingestr/src/stripe_analytics/helpers.py,sha256=8in6k1ndTon7xNh8QPDqThBWvKY9XQrmrJXveAOA6R4,13858
+ingestr/src/stripe_analytics/__init__.py,sha256=mK8dGKAlMPVqGE9gG30XfbvOvgVD0yWhNpt-D3iavDY,6385
+ingestr/src/stripe_analytics/helpers.py,sha256=O5ow8xORcyLhw1Yn6vFm__tASfmPOgR0TMVU9gXmxcE,11828
 ingestr/src/stripe_analytics/settings.py,sha256=xt1-ljwP4nLTNUa8l3KwFbtK8FtQHgHpzGF5uPKfRsw,2246
 ingestr/src/telemetry/event.py,sha256=W7bs4uVfPakQ5otmiqgqu1l5SqjYx1p87wudnWXckBc,949
 ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
@@ -143,8 +143,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
 ingestr/tests/unit/test_smartsheets.py,sha256=eiC2CCO4iNJcuN36ONvqmEDryCA1bA1REpayHpu42lk,5058
-ingestr-0.13.59.dist-info/METADATA,sha256=8yM2vLMiUV_zBq15gg_1Vf6UHgtwoRzawb_tcT_K3Wc,14993
-ingestr-0.13.59.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ingestr-0.13.59.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
-ingestr-0.13.59.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
-ingestr-0.13.59.dist-info/RECORD,,
+ingestr-0.13.60.dist-info/METADATA,sha256=FwdcfGIPPRKlSV8wJX1HAqHriGUZBl_XXi0Yco8O874,14993
+ingestr-0.13.60.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.60.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.60.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.60.dist-info/RECORD,,

{ingestr-0.13.59.dist-info → ingestr-0.13.60.dist-info}/WHEEL RENAMED Viewed

File without changes

{ingestr-0.13.59.dist-info → ingestr-0.13.60.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{ingestr-0.13.59.dist-info → ingestr-0.13.60.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

ingestr 0.13.59__py3-none-any.whl → 0.13.60__py3-none-any.whl

Potentially problematic release.

ingestr 0.13.59__py3-none-any.whl → 0.13.60__py3-none-any.whl