ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,139 @@
1
+ """Reads messages from Kinesis queue."""
2
+
3
+ from typing import Iterable, List, Optional
4
+
5
+ import dlt
6
+ from dlt.common import json, pendulum
7
+ from dlt.common.configuration.specs import AwsCredentials
8
+ from dlt.common.time import ensure_pendulum_datetime
9
+ from dlt.common.typing import StrStr, TAnyDateTime, TDataItem
10
+ from dlt.common.utils import digest128
11
+
12
+ from .helpers import get_shard_iterator, get_stream_address, max_sequence_by_shard
13
+
14
+
15
+ @dlt.resource(
16
+ name=lambda args: args["stream_name"],
17
+ primary_key="kinesis_msg_id",
18
+ standalone=True,
19
+ max_table_nesting=0,
20
+ )
21
+ def kinesis_stream(
22
+ stream_name: str,
23
+ initial_at_timestamp: TAnyDateTime,
24
+ credentials: AwsCredentials,
25
+ last_msg: Optional[dlt.sources.incremental[StrStr]] = dlt.sources.incremental(
26
+ "kinesis", last_value_func=max_sequence_by_shard
27
+ ),
28
+ max_number_of_messages: int = None, # type: ignore
29
+ milliseconds_behind_latest: int = 1000,
30
+ parse_json: bool = True,
31
+ chunk_size: int = 1000,
32
+ ) -> Iterable[TDataItem]:
33
+ """Reads a kinesis stream and yields messages. Supports incremental loading. Parses messages as json by default.
34
+
35
+ Args:
36
+ stream_name (str): The name of the stream to read from. If not provided, the
37
+ value must be present in config/secrets
38
+ credentials (AwsCredentials): The credentials to use to connect to kinesis. If not provided,
39
+ the value from secrets or credentials present on the device will be used.
40
+ last_msg (Optional[dlt.sources.incremental]): An incremental over a mapping from shard_id to message sequence
41
+ that will be used to create shard iterators of type AFTER_SEQUENCE_NUMBER when loading incrementally.
42
+ initial_at_timestamp (TAnyDateTime): An initial timestamp used to generate AT_TIMESTAMP or LATEST iterator when timestamp value is 0
43
+ max_number_of_messages (int): Maximum number of messages to read in one run. Actual read may exceed that number by up to chunk_size. Defaults to None (no limit).
44
+ milliseconds_behind_latest (int): The number of milliseconds behind the top of the shard to stop reading messages, defaults to 1000.
45
+ parse_json (bool): If True, assumes that messages are json strings, parses them and returns instead of `data` (otherwise). Defaults to True.
46
+ chunk_size (int): The number of records to fetch at once. Defaults to 1000.
47
+ Yields:
48
+ Iterable[TDataItem]: Messages. Contain Kinesis envelope in `kinesis` and bytes data in `data` (if `parse_json` disabled)
49
+
50
+ """
51
+ session = credentials._to_botocore_session()
52
+ # the default timeouts are (60, 60) which is fine
53
+ kinesis_client = session.create_client("kinesis")
54
+ # normalize at_timestamp to pendulum
55
+ initial_at_datetime = (
56
+ None
57
+ if initial_at_timestamp is None
58
+ else ensure_pendulum_datetime(initial_at_timestamp)
59
+ )
60
+ # set it in state
61
+ resource_state = dlt.current.resource_state()
62
+ initial_at_datetime = resource_state.get(
63
+ "initial_at_timestamp", initial_at_datetime
64
+ )
65
+ # so next time we request shards at AT_TIMESTAMP that is now
66
+ resource_state["initial_at_timestamp"] = pendulum.now("UTC").subtract(seconds=1)
67
+
68
+ shards_list = kinesis_client.list_shards(**get_stream_address(stream_name))
69
+ shards: List[StrStr] = shards_list["Shards"]
70
+ while next_token := shards_list.get("NextToken"):
71
+ shards_list = kinesis_client.list_shards(NextToken=next_token)
72
+ shards.extend(shards_list)
73
+
74
+ shard_ids = [shard["ShardId"] for shard in shards]
75
+
76
+ # get next shard to fetch messages from
77
+ while shard_id := shard_ids.pop(0) if shard_ids else None:
78
+ shard_iterator, _ = get_shard_iterator(
79
+ kinesis_client,
80
+ stream_name,
81
+ shard_id,
82
+ last_msg, # type: ignore
83
+ initial_at_datetime, # type: ignore
84
+ )
85
+
86
+ while shard_iterator:
87
+ records = []
88
+ records_response = kinesis_client.get_records(
89
+ ShardIterator=shard_iterator,
90
+ Limit=chunk_size, # The size of data can be up to 1 MB, it must be controlled by the user
91
+ )
92
+
93
+ for record in records_response["Records"]:
94
+ sequence_number = record["SequenceNumber"]
95
+ content = record["Data"]
96
+
97
+ arrival_time = record["ApproximateArrivalTimestamp"]
98
+ arrival_timestamp = arrival_time.astimezone(pendulum.UTC)
99
+
100
+ message = {
101
+ "kinesis": {
102
+ "shard_id": shard_id,
103
+ "seq_no": sequence_number,
104
+ "ts": ensure_pendulum_datetime(arrival_timestamp),
105
+ "partition": record["PartitionKey"],
106
+ "stream_name": stream_name,
107
+ },
108
+ "kinesis_msg_id": digest128(shard_id + sequence_number),
109
+ }
110
+
111
+ if parse_json:
112
+ message.update(json.loadb(content))
113
+ else:
114
+ message["data"] = content
115
+ records.append(message)
116
+ yield records
117
+
118
+ # do not load more max_number_of_messages
119
+ if max_number_of_messages is not None:
120
+ max_number_of_messages -= len(records)
121
+ if max_number_of_messages <= 0:
122
+ return
123
+
124
+ # add child shards so we can request messages from them
125
+ child_shards = records_response.get("ChildShards", None)
126
+ if child_shards:
127
+ for child_shard in child_shards:
128
+ child_shard_id = child_shard["ShardId"]
129
+ if child_shard_id not in shards:
130
+ shard_ids.append(child_shard_id)
131
+
132
+ # gets 0 when no messages so we cutoff empty shards
133
+ records_ms_behind_latest = records_response.get("MillisBehindLatest", 0)
134
+ if records_ms_behind_latest < milliseconds_behind_latest:
135
+ # stop taking messages from shard
136
+ shard_iterator = None # type: ignore
137
+ else:
138
+ # continue taking messages
139
+ shard_iterator = records_response["NextShardIterator"]
@@ -0,0 +1,82 @@
1
+ from typing import Any, Sequence, Tuple
2
+
3
+ import dlt
4
+ from dlt.common import pendulum
5
+ from dlt.common.typing import DictStrAny, DictStrStr, StrAny, StrStr
6
+
7
+
8
def get_shard_iterator(
    kinesis_client: Any,
    stream_name: str,
    shard_id: str,
    last_msg: dlt.sources.incremental[StrStr],
    initial_at_timestamp: pendulum.DateTime | None,
) -> Tuple[str, StrAny]:
    """Create a shard iterator for `shard_id` of `stream_name`.

    Preference order: AFTER_SEQUENCE_NUMBER when the incremental state already
    holds a sequence for this shard, then AT_TIMESTAMP (or LATEST when the
    timestamp is the epoch sentinel 0) derived from `initial_at_timestamp`,
    with TRIM_HORIZON (read the full retained history) as the final fallback.
    Returns the iterator token together with the params used to create it.
    """
    state: StrStr = {}
    if last_msg is not None:
        state = last_msg.last_value or last_msg.initial_value or {}

    iterator_params: DictStrAny
    msg_sequence = state.get(shard_id)
    if msg_sequence:
        # resume right after the last message previously read on this shard
        iterator_params = {
            "ShardIteratorType": "AFTER_SEQUENCE_NUMBER",
            "StartingSequenceNumber": msg_sequence,
        }
    elif initial_at_timestamp is None:
        # no state and no timestamp: fetch all records from the beginning
        iterator_params = {"ShardIteratorType": "TRIM_HORIZON"}
    elif initial_at_timestamp.timestamp() == 0.0:
        # epoch sentinel: only messages at the tip of the stream are read
        iterator_params = {"ShardIteratorType": "LATEST"}
    else:
        iterator_params = {
            "ShardIteratorType": "AT_TIMESTAMP",
            "Timestamp": initial_at_timestamp.timestamp(),
        }

    response: StrStr = kinesis_client.get_shard_iterator(
        **get_stream_address(stream_name), ShardId=shard_id, **iterator_params
    )
    return response["ShardIterator"], iterator_params
46
+
47
+
48
def max_sequence_by_shard(values: Sequence[StrStr]) -> StrStr:
    """A last_value_func keeping, per shard_id, the max message sequence seen."""
    if len(values) == 1:
        # first invocation: only the new kinesis metadata, no previous state
        item, previous = values[0], None
    else:
        # item is kinesis metadata, previous is the prior state of the shards
        item, previous = values

    # always work on a copy so the stored state is never mutated in place
    merged: StrStr = {} if previous is None else dict(previous)
    shard_id = item["shard_id"]
    # compare message sequences at shard_id; "" loses to any real sequence
    merged[shard_id] = max(item["seq_no"], merged.get(shard_id, ""))
    return merged
66
+
67
+
68
def get_stream_address(stream_name: str) -> DictStrStr:
    """
    Return the keyword argument addressing the stream for the Kinesis API.

    Identifiers starting with ``arn:`` (e.g.
    ``arn:aws:kinesis:eu-central-1:842404475894:stream/customer_events``) are
    passed as ``StreamARN``; anything else (e.g. ``customer_events``) is
    treated as a plain ``StreamName``.

    https://docs.aws.amazon.com/kinesis/latest/APIReference/API_StreamDescription.html#Streams-Type-StreamDescription-StreamName
    https://docs.aws.amazon.com/kinesis/latest/APIReference/API_StreamDescription.html#Streams-Type-StreamDescription-StreamARN
    """
    key = "StreamARN" if stream_name.startswith("arn:") else "StreamName"
    return {key: stream_name}
@@ -18,7 +18,6 @@ def retry_on_limit(response: requests.Response, exception: BaseException) -> boo
18
18
 
19
19
  def create_client() -> requests.Session:
20
20
  return Client(
21
- request_timeout=10.0,
22
21
  raise_for_status=False,
23
22
  retry_condition=retry_on_limit,
24
23
  request_max_attempts=12,
@@ -31,7 +30,7 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour
31
30
  start_date_obj = ensure_pendulum_datetime(start_date)
32
31
  client = KlaviyoClient(api_key)
33
32
 
34
- @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
33
+ @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
35
34
  def events(
36
35
  datetime=dlt.sources.incremental(
37
36
  "datetime",
@@ -136,7 +135,7 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour
136
135
  ) -> Iterable[TDataItem]:
137
136
  yield from client.fetch_catalog_item(create_client(), updated.start_value)
138
137
 
139
- @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
138
+ @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
140
139
  def forms(
141
140
  updated_at=dlt.sources.incremental(
142
141
  "updated_at",
@@ -163,7 +162,7 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour
163
162
  ) -> Iterable[TDataItem]:
164
163
  yield from client.fetch_lists(create_client(), updated.start_value)
165
164
 
166
- @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
165
+ @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
167
166
  def images(
168
167
  updated_at=dlt.sources.incremental(
169
168
  "updated_at",
@@ -189,7 +188,7 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour
189
188
  ) -> Iterable[TDataItem]:
190
189
  yield from client.fetch_segments(create_client(), updated.start_value)
191
190
 
192
- @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
191
+ @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
193
192
  def flows(
194
193
  updated=dlt.sources.incremental(
195
194
  "updated",
@@ -204,7 +203,7 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour
204
203
  for start, end in intervals:
205
204
  yield lambda s=start, e=end: client.fetch_flows(create_client(), s, e)
206
205
 
207
- @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
206
+ @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
208
207
  def templates(
209
208
  updated=dlt.sources.incremental(
210
209
  "updated",