ingestr-0.13.59-py3-none-any.whl → ingestr-0.13.60-py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, exactly as they were released to one of the supported registries. It is provided for informational purposes only.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

ingestr/main.py CHANGED
@@ -543,6 +543,7 @@ def ingest(
543
543
  sql_reflection_level=sql_reflection_level.value,
544
544
  sql_limit=sql_limit,
545
545
  sql_exclude_columns=sql_exclude_columns,
546
+ extract_parallelism=extract_parallelism,
546
547
  )
547
548
 
548
549
  resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
1
- version = "v0.13.59"
1
+ version = "v0.13.60"
ingestr/src/sources.py CHANGED
@@ -737,6 +737,7 @@ class StripeAnalyticsSource:
737
737
  endpoint,
738
738
  ],
739
739
  stripe_secret_key=api_key[0],
740
+ max_workers=kwargs.get("extract_parallelism", 4),
740
741
  ).with_resources(endpoint)
741
742
 
742
743
  raise ValueError(
@@ -10,7 +10,6 @@ from pendulum import DateTime
10
10
  from .helpers import (
11
11
  async_parallel_pagination,
12
12
  pagination,
13
- parallel_pagination,
14
13
  transform_date,
15
14
  )
16
15
 
@@ -55,53 +54,13 @@ def stripe_source(
55
54
  )(endpoint)
56
55
 
57
56
 
58
- @dlt.source(max_table_nesting=0)
59
- def parallel_stripe_source(
60
- endpoints: Tuple[str, ...],
61
- stripe_secret_key: str = dlt.secrets.value,
62
- start_date: Optional[DateTime] = None,
63
- end_date: Optional[DateTime] = None,
64
- max_workers: int = 12,
65
- ) -> Iterable[DltResource]:
66
- """
67
- Retrieves data from the Stripe API for the specified endpoints using parallel pagination.
68
-
69
- This source divides the date range across multiple workers to fetch data in parallel,
70
- which can significantly speed up data retrieval for large date ranges.
71
-
72
- Args:
73
- endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
74
- stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
75
- start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Required for parallel processing.
76
- end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Required for parallel processing.
77
- max_workers (int): Maximum number of worker threads for parallel fetching. Defaults to 4.
78
-
79
- Returns:
80
- Iterable[DltResource]: Resources with data that was created during the period greater than or equal to 'start_date' and less than 'end_date'.
81
- """
82
- stripe.api_key = stripe_secret_key
83
- stripe.api_version = "2022-11-15"
84
-
85
- def parallel_stripe_resource(
86
- endpoint: str,
87
- ) -> Generator[Dict[Any, Any], Any, None]:
88
- yield from parallel_pagination(endpoint, start_date, end_date, max_workers)
89
-
90
- for endpoint in endpoints:
91
- yield dlt.resource(
92
- parallel_stripe_resource,
93
- name=endpoint,
94
- write_disposition="replace",
95
- )(endpoint)
96
-
97
-
98
57
  @dlt.source(max_table_nesting=0)
99
58
  def async_stripe_source(
100
59
  endpoints: Tuple[str, ...],
101
60
  stripe_secret_key: str = dlt.secrets.value,
102
61
  start_date: Optional[DateTime] = None,
103
62
  end_date: Optional[DateTime] = None,
104
- max_workers: int = 40,
63
+ max_workers: int = 4,
105
64
  rate_limit_delay: float = 0.03,
106
65
  ) -> Iterable[DltResource]:
107
66
  """
@@ -43,67 +43,6 @@ def pagination(
43
43
  break
44
44
 
45
45
 
46
- def parallel_pagination(
47
- endpoint: str,
48
- start_date: Optional[Any] = None,
49
- end_date: Optional[Any] = None,
50
- max_workers: int = 4,
51
- ) -> Iterable[TDataItem]:
52
- """
53
- Retrieves data from an endpoint with parallel pagination by dividing date ranges across workers.
54
-
55
- Args:
56
- endpoint (str): The endpoint to retrieve data from.
57
- start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to 2010-01-01 if None.
58
- end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to today if None.
59
- max_workers (int): Maximum number of worker threads to use for parallel fetching. Defaults to 4.
60
-
61
- Returns:
62
- Iterable[TDataItem]: Data items retrieved from the endpoint.
63
- """
64
- # Set default date range if not provided: 2010 to today
65
- if not start_date:
66
- start_date = pendulum.datetime(2010, 1, 1)
67
- if not end_date:
68
- end_date = pendulum.now()
69
-
70
- # Convert dates to timestamps for processing
71
- start_ts = transform_date(start_date)
72
- end_ts = transform_date(end_date)
73
-
74
- # If date range is very small, use sequential pagination
75
- date_range_days = (end_ts - start_ts) / (24 * 60 * 60)
76
- if date_range_days < 30: # Less than 30 days
77
- yield from pagination(endpoint, start_date, end_date)
78
- return
79
-
80
- # Create time chunks with larger chunks for 2010s (less data expected)
81
- time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)
82
-
83
- # Use ThreadPoolExecutor to fetch data in parallel and yield as soon as ready
84
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
85
- # Submit all tasks
86
- future_to_chunk = {
87
- executor.submit(
88
- _fetch_chunk_data_streaming, endpoint, chunk_start, chunk_end
89
- ): (chunk_start, chunk_end)
90
- for chunk_start, chunk_end in time_chunks
91
- }
92
-
93
- # MAXIMUM SPEED - Yield results immediately as they complete
94
- for future in as_completed(future_to_chunk):
95
- chunk_start, chunk_end = future_to_chunk[future]
96
- try:
97
- chunk_data = future.result()
98
- # Yield all batches from this chunk immediately - NO ORDERING
99
- for batch in chunk_data:
100
- yield batch
101
-
102
- except Exception as exc:
103
- print(f"Chunk {chunk_start}-{chunk_end} generated an exception: {exc}")
104
- raise exc
105
-
106
-
107
46
  def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
108
47
  """
109
48
  Divide a time range into equal chunks for parallel processing.
@@ -295,7 +234,6 @@ async def async_parallel_pagination(
295
234
 
296
235
  async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
297
236
  async with semaphore:
298
- await asyncio.sleep(rate_limit_delay)
299
237
  return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)
300
238
 
301
239
  # Create all tasks
@@ -390,6 +328,10 @@ async def stripe_get_data_async(
390
328
  max_wait_time_ms = 10000
391
329
 
392
330
  while retry_count < max_retries:
331
+ # print(
332
+ # f"Fetching {resource} from {datetime.fromtimestamp(start_date).strftime('%Y-%m-%d %H:%M:%S') if start_date else 'None'} to {datetime.fromtimestamp(end_date).strftime('%Y-%m-%d %H:%M:%S') if end_date else 'None'}, retry {retry_count} of {max_retries}",
333
+ # flush=True,
334
+ # )
393
335
  try:
394
336
  resource_dict = await getattr(stripe, resource).list_async(
395
337
  created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
@@ -399,6 +341,10 @@ async def stripe_get_data_async(
399
341
  retry_count += 1
400
342
  if retry_count < max_retries:
401
343
  wait_time = min(2**retry_count * 0.001, max_wait_time_ms)
344
+ print(
345
+ f"Got rate limited, sleeping {wait_time} seconds before retrying...",
346
+ flush=True,
347
+ )
402
348
  await asyncio.sleep(wait_time)
403
349
  else:
404
350
  # Re-raise the last exception if we've exhausted retries
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.13.59
3
+ Version: 0.13.60
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -1,8 +1,8 @@
1
1
  ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
2
- ingestr/main.py,sha256=GkC1hdq8AVGrvolc95zMfjmibI95p2pmFkbgCOVf-Og,26311
2
+ ingestr/main.py,sha256=taDyHyaVSpB17iNLl8zA0gmr4CqDO-MSTQX1CaRBB9U,26364
3
3
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
4
4
  ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
5
- ingestr/src/buildinfo.py,sha256=RkwXqGVjCaeOi85qaAT-2wI5-IYpZt76x8qkp2dVM1o,21
5
+ ingestr/src/buildinfo.py,sha256=1sTup4WLO36DuLnh5cnxtmEDBjKKYxAOSisEvjELy1w,21
6
6
  ingestr/src/destinations.py,sha256=TcxM2rcwHfgMMP6U0yRNcfWKlEzkBbZbqCIDww7lkTY,16882
7
7
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
8
8
  ingestr/src/factory.py,sha256=OKqjYqvHhgaOF48-eSNSabcfXt4Gmr1yZ8cFGizXh0g,6319
@@ -11,7 +11,7 @@ ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,73
11
11
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
12
12
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
13
13
  ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
14
- ingestr/src/sources.py,sha256=C2qPplmvRQdm1nzSPvGbMpYG6oGCbGZMTlVtVS48n6k,98977
14
+ ingestr/src/sources.py,sha256=sJmiiInFb-KCPsaIy4qus6lx59MDCOobWgxJ7lfKH08,99047
15
15
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
16
16
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
17
17
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -119,8 +119,8 @@ ingestr/src/solidgate/__init__.py,sha256=JdaXvAu5QGuf9-FY294vwCQCEmfrqIld9oqbzqC
119
119
  ingestr/src/solidgate/helpers.py,sha256=oePEc9nnvmN3IaKrfJCvyKL79xdGM0-gRTN3-8tY4Fc,4952
120
120
  ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
121
121
  ingestr/src/sql_database/callbacks.py,sha256=sEFFmXxAURY3yeBjnawigDtq9LBCvi8HFqG4kLd7tMU,2002
122
- ingestr/src/stripe_analytics/__init__.py,sha256=g2miuPexUcPEEMzmPQZqxEaQ0Q8YjUAkOvKaLn3KC-c,8219
123
- ingestr/src/stripe_analytics/helpers.py,sha256=8in6k1ndTon7xNh8QPDqThBWvKY9XQrmrJXveAOA6R4,13858
122
+ ingestr/src/stripe_analytics/__init__.py,sha256=mK8dGKAlMPVqGE9gG30XfbvOvgVD0yWhNpt-D3iavDY,6385
123
+ ingestr/src/stripe_analytics/helpers.py,sha256=O5ow8xORcyLhw1Yn6vFm__tASfmPOgR0TMVU9gXmxcE,11828
124
124
  ingestr/src/stripe_analytics/settings.py,sha256=xt1-ljwP4nLTNUa8l3KwFbtK8FtQHgHpzGF5uPKfRsw,2246
125
125
  ingestr/src/telemetry/event.py,sha256=W7bs4uVfPakQ5otmiqgqu1l5SqjYx1p87wudnWXckBc,949
126
126
  ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
@@ -143,8 +143,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
143
143
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
144
144
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
145
145
  ingestr/tests/unit/test_smartsheets.py,sha256=eiC2CCO4iNJcuN36ONvqmEDryCA1bA1REpayHpu42lk,5058
146
- ingestr-0.13.59.dist-info/METADATA,sha256=8yM2vLMiUV_zBq15gg_1Vf6UHgtwoRzawb_tcT_K3Wc,14993
147
- ingestr-0.13.59.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
148
- ingestr-0.13.59.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
149
- ingestr-0.13.59.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
150
- ingestr-0.13.59.dist-info/RECORD,,
146
+ ingestr-0.13.60.dist-info/METADATA,sha256=FwdcfGIPPRKlSV8wJX1HAqHriGUZBl_XXi0Yco8O874,14993
147
+ ingestr-0.13.60.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
148
+ ingestr-0.13.60.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
149
+ ingestr-0.13.60.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
150
+ ingestr-0.13.60.dist-info/RECORD,,