ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79)
  1. ingestr/main.py +22 -3
  2. ingestr/src/adjust/__init__.py +4 -4
  3. ingestr/src/allium/__init__.py +128 -0
  4. ingestr/src/anthropic/__init__.py +277 -0
  5. ingestr/src/anthropic/helpers.py +525 -0
  6. ingestr/src/appstore/__init__.py +1 -0
  7. ingestr/src/asana_source/__init__.py +1 -1
  8. ingestr/src/buildinfo.py +1 -1
  9. ingestr/src/chess/__init__.py +1 -1
  10. ingestr/src/couchbase_source/__init__.py +118 -0
  11. ingestr/src/couchbase_source/helpers.py +135 -0
  12. ingestr/src/cursor/__init__.py +83 -0
  13. ingestr/src/cursor/helpers.py +188 -0
  14. ingestr/src/destinations.py +169 -1
  15. ingestr/src/docebo/__init__.py +589 -0
  16. ingestr/src/docebo/client.py +435 -0
  17. ingestr/src/docebo/helpers.py +97 -0
  18. ingestr/src/elasticsearch/helpers.py +138 -0
  19. ingestr/src/errors.py +8 -0
  20. ingestr/src/facebook_ads/__init__.py +26 -23
  21. ingestr/src/facebook_ads/helpers.py +47 -1
  22. ingestr/src/factory.py +48 -0
  23. ingestr/src/filesystem/__init__.py +8 -3
  24. ingestr/src/filters.py +9 -0
  25. ingestr/src/fluxx/__init__.py +9906 -0
  26. ingestr/src/fluxx/helpers.py +209 -0
  27. ingestr/src/frankfurter/__init__.py +157 -163
  28. ingestr/src/frankfurter/helpers.py +3 -3
  29. ingestr/src/freshdesk/__init__.py +25 -8
  30. ingestr/src/freshdesk/freshdesk_client.py +40 -5
  31. ingestr/src/fundraiseup/__init__.py +49 -0
  32. ingestr/src/fundraiseup/client.py +81 -0
  33. ingestr/src/github/__init__.py +6 -4
  34. ingestr/src/google_analytics/__init__.py +1 -1
  35. ingestr/src/hostaway/__init__.py +302 -0
  36. ingestr/src/hostaway/client.py +288 -0
  37. ingestr/src/http/__init__.py +35 -0
  38. ingestr/src/http/readers.py +114 -0
  39. ingestr/src/hubspot/__init__.py +6 -12
  40. ingestr/src/influxdb/__init__.py +1 -0
  41. ingestr/src/intercom/__init__.py +142 -0
  42. ingestr/src/intercom/helpers.py +674 -0
  43. ingestr/src/intercom/settings.py +279 -0
  44. ingestr/src/jira_source/__init__.py +340 -0
  45. ingestr/src/jira_source/helpers.py +439 -0
  46. ingestr/src/jira_source/settings.py +170 -0
  47. ingestr/src/klaviyo/__init__.py +5 -5
  48. ingestr/src/linear/__init__.py +553 -116
  49. ingestr/src/linear/helpers.py +77 -38
  50. ingestr/src/mailchimp/__init__.py +126 -0
  51. ingestr/src/mailchimp/helpers.py +226 -0
  52. ingestr/src/mailchimp/settings.py +164 -0
  53. ingestr/src/masking.py +344 -0
  54. ingestr/src/monday/__init__.py +246 -0
  55. ingestr/src/monday/helpers.py +392 -0
  56. ingestr/src/monday/settings.py +328 -0
  57. ingestr/src/mongodb/__init__.py +5 -2
  58. ingestr/src/mongodb/helpers.py +384 -10
  59. ingestr/src/plusvibeai/__init__.py +335 -0
  60. ingestr/src/plusvibeai/helpers.py +544 -0
  61. ingestr/src/plusvibeai/settings.py +252 -0
  62. ingestr/src/revenuecat/__init__.py +83 -0
  63. ingestr/src/revenuecat/helpers.py +237 -0
  64. ingestr/src/salesforce/__init__.py +15 -8
  65. ingestr/src/shopify/__init__.py +1 -1
  66. ingestr/src/smartsheets/__init__.py +33 -5
  67. ingestr/src/socrata_source/__init__.py +83 -0
  68. ingestr/src/socrata_source/helpers.py +85 -0
  69. ingestr/src/socrata_source/settings.py +8 -0
  70. ingestr/src/sources.py +1418 -54
  71. ingestr/src/stripe_analytics/__init__.py +2 -19
  72. ingestr/src/wise/__init__.py +68 -0
  73. ingestr/src/wise/client.py +63 -0
  74. ingestr/tests/unit/test_smartsheets.py +6 -9
  75. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
  76. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
  77. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
  78. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
  79. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 from enum import Enum
 from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated
 
 from ingestr.src.telemetry.event import track
 
+try:
+    from duckdb_engine import DuckDBEngineWarning
+
+    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+except ImportError:
+    # duckdb-engine not installed
+    pass
+
 app = typer.Typer(
     name="ingestr",
     help="ingestr is the CLI tool to ingest data from one source to another",
@@ -273,6 +282,13 @@ def ingest(
             envvar=["STAGING_BUCKET", "INGESTR_STAGING_BUCKET"],
         ),
     ] = None,  # type: ignore
+    mask: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
+            envvar=["MASK", "INGESTR_MASK"],
+        ),
+    ] = [],  # type: ignore
 ):
     import hashlib
     import tempfile
@@ -293,6 +309,7 @@ def ingest(
     from ingestr.src.filters import (
         cast_set_to_list,
         cast_spanner_types,
+        create_masking_filter,
         handle_mysql_empty_dates,
     )
     from ingestr.src.sources import MongoDbSource
@@ -506,7 +523,6 @@ def ingest(
 
     if factory.source_scheme == "sqlite":
         source_table = "main." + source_table.split(".")[-1]
-
 
     if (
         incremental_key
@@ -554,6 +570,10 @@ def ingest(
     if factory.source_scheme.startswith("spanner"):
         resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
 
+    if mask:
+        masking_filter = create_masking_filter(mask)
+        resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
+
     if yield_limit:
         resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
 
@@ -600,10 +620,9 @@ def ingest(
     if factory.source_scheme == "influxdb":
         if primary_key:
             write_disposition = "merge"
-
 
     start_time = datetime.now()
-
+
     run_info: LoadInfo = pipeline.run(
         dlt_source,
         **destination.dlt_run_params(
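
The --mask flow above parses each 'column:algorithm[:param]' spec with create_masking_filter and applies the resulting callable to every record via add_map. A minimal sketch of that wiring; the 'hash' algorithm name and the sample row are hypothetical, since the available algorithms live in the new ingestr/src/masking.py, which this diff lists but does not show inline:

from ingestr.src.filters import create_masking_filter

# Build a row-level filter from CLI-style specs; 'email:hash' follows the
# 'column:algorithm[:param]' format ('hash' is a hypothetical algorithm name).
masking_filter = create_masking_filter(["email:hash"])

# ingestr applies the filter per record through resource.add_map(masking_filter);
# calling it directly on one dict illustrates the same transformation.
row = {"id": 1, "email": "user@example.com"}
masked_row = masking_filter(row)
print(masked_row)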
ingestr/src/adjust/__init__.py CHANGED
@@ -46,7 +46,7 @@ def adjust_source(
     filters: Optional[dict] = None,
 ) -> Sequence[DltResource]:
     @dlt.resource(write_disposition="merge", merge_key="day")
-    def campaigns():
+    def campaigns() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -57,12 +57,12 @@ def adjust_source(
         )
 
     @dlt.resource(write_disposition="replace", primary_key="id")
-    def events():
+    def events() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield adjust_api.fetch_events()
 
     @dlt.resource(write_disposition="merge", merge_key="day")
-    def creatives():
+    def creatives() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -95,7 +95,7 @@ def adjust_source(
         primary_key=dimensions,
         columns=type_hints,
     )
-    def custom():
+    def custom() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
ingestr/src/allium/__init__.py ADDED
@@ -0,0 +1,128 @@
+"""
+Allium source for data extraction via REST API.
+
+This source provides access to Allium blockchain data via asynchronous query execution.
+"""
+
+import time
+from typing import Any, Iterator
+
+import dlt
+
+from ingestr.src.http_client import create_client
+
+
+@dlt.source(max_table_nesting=0, name="allium_source")
+def allium_source(
+    api_key: str,
+    query_id: str,
+    parameters: dict[str, Any] | None = None,
+    limit: int | None = None,
+    compute_profile: str | None = None,
+) -> Any:
+    """
+    Allium data source for blockchain data extraction.
+
+    This source connects to Allium API, runs async queries, and fetches results.
+
+    Args:
+        api_key: Allium API key for authentication
+        query_id: The query ID to execute (e.g., 'abc123')
+        parameters: Optional parameters for the query (e.g., {'start_date': '2025-02-01', 'end_date': '2025-02-02'})
+        limit: Limit the number of rows in the result (max 250,000)
+        compute_profile: Compute profile identifier
+
+    Yields:
+        DltResource: Data resources for Allium query results
+    """
+    base_url = "https://api.allium.so/api/v1/explorer"
+    session = create_client()
+    headers = {"X-API-Key": api_key}
+
+    @dlt.resource(
+        name="query_results",
+        write_disposition="replace",
+    )
+    def fetch_query_results() -> Iterator[dict[str, Any]]:
+        """
+        Fetch query results from Allium.
+
+        This function:
+        1. Starts an async query execution
+        2. Polls for completion status
+        3. Fetches and yields the results
+        """
+        # Step 1: Start async query execution
+        run_config: dict[str, Any] = {}
+        if limit is not None:
+            run_config["limit"] = limit
+        if compute_profile is not None:
+            run_config["compute_profile"] = compute_profile
+
+        run_payload = {"parameters": parameters or {}, "run_config": run_config}
+
+        run_response = session.post(
+            f"{base_url}/queries/{query_id}/run-async",
+            json=run_payload,
+            headers=headers,
+        )
+
+        run_data = run_response.json()
+
+        if "run_id" not in run_data:
+            raise ValueError(f"Failed to start query execution: {run_data}")
+
+        run_id = run_data["run_id"]
+
+        # Step 2: Poll for completion
+        max_retries = 8640  # Max 12 hours with 5-second intervals
+        retry_count = 0
+        poll_interval = 5  # seconds
+
+        while retry_count < max_retries:
+            status_response = session.get(
+                f"{base_url}/query-runs/{run_id}/status",
+                headers=headers,
+            )
+            status_response.raise_for_status()
+            status_data = status_response.json()
+
+            # Handle both string and dict responses
+            if isinstance(status_data, str):
+                status = status_data
+            else:
+                status = status_data.get("status")
+
+            if status == "success":
+                break
+            elif status == "failed":
+                error_msg = (
+                    status_data.get("error", "Unknown error")
+                    if isinstance(status_data, dict)
+                    else "Unknown error"
+                )
+                raise ValueError(f"Query execution failed: {error_msg}")
+            elif status in ["pending", "running", "queued"]:
+                time.sleep(poll_interval)
+                retry_count += 1
+            else:
+                raise ValueError(f"Unknown status: {status}")
+
+        if retry_count >= max_retries:
+            raise TimeoutError(
+                f"Query execution timed out after {max_retries * poll_interval} seconds"
+            )
+
+        # Step 3: Fetch results
+        results_response = session.get(
+            f"{base_url}/query-runs/{run_id}/results",
+            headers=headers,
+            params={"f": "json"},
+        )
+        results_response.raise_for_status()
+        query_output = results_response.json()
+
+        # Extract and yield all data
+        yield query_output.get("data", [])
+
+    return (fetch_query_results,)
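
Since allium_source is a regular dlt source, it can also be exercised standalone. A minimal usage sketch, assuming a duckdb destination; the API key, query id, and parameters are placeholders:

import dlt

from ingestr.src.allium import allium_source

pipeline = dlt.pipeline(
    pipeline_name="allium",
    destination="duckdb",
    dataset_name="allium_data",
)

# run-async submission, status polling, and the final result fetch all happen
# inside the query_results resource defined above
info = pipeline.run(
    allium_source(
        api_key="ALLIUM_API_KEY",  # placeholder
        query_id="abc123",         # example id from the docstring
        parameters={"start_date": "2025-02-01", "end_date": "2025-02-02"},
        limit=1000,
    )
)
print(info)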
ingestr/src/anthropic/__init__.py ADDED
@@ -0,0 +1,277 @@
+"""Anthropic source for loading Claude Code usage analytics and other Anthropic API data."""
+
+from typing import Any, Dict, Iterator, Optional, Sequence
+
+import dlt
+import pendulum
+from dlt.sources import DltResource
+
+from .helpers import (
+    fetch_api_keys,
+    fetch_claude_code_usage,
+    fetch_cost_report,
+    fetch_invites,
+    fetch_organization_info,
+    fetch_usage_report,
+    fetch_users,
+    fetch_workspace_members,
+    fetch_workspaces,
+)
+
+
+@dlt.source(max_table_nesting=0)
+def anthropic_source(
+    api_key: str,
+    initial_start_date: Optional[pendulum.DateTime] = None,
+    end_date: Optional[pendulum.DateTime] = None,
+) -> Sequence[DltResource]:
+    """
+    Load data from Anthropic APIs.
+
+    Currently supports:
+    - Claude Code Usage Analytics
+
+    Args:
+        api_key: Anthropic Admin API key (starts with sk-ant-admin...)
+        initial_start_date: Start date for data retrieval (defaults to 2023-01-01)
+        end_date: Optional end date for data retrieval
+
+    Returns:
+        Sequence of DLT resources with Anthropic data
+    """
+
+    # Default start date to 2023-01-01 if not provided
+    start_date: pendulum.DateTime = (
+        initial_start_date
+        if initial_start_date is not None
+        else pendulum.datetime(2023, 1, 1)
+    )
+
+    # Prepare end_value for incremental
+    end_value_str = None
+    if end_date is not None:
+        end_value_str = end_date.to_date_string()
+
+    @dlt.resource(
+        name="claude_code_usage",
+        write_disposition="merge",
+        primary_key=["date", "actor_type", "actor_id", "terminal_type"],
+    )
+    def claude_code_usage(
+        date: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "date",
+            initial_value=start_date.to_date_string(),
+            end_value=end_value_str,
+        ),
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Load Claude Code usage analytics data incrementally by date.
+
+        Yields flattened records with:
+        - date: The date of the usage data
+        - actor_type: Type of actor (user_actor or api_actor)
+        - actor_id: Email address or API key name
+        - organization_id: Organization UUID
+        - customer_type: api or subscription
+        - terminal_type: Terminal/environment type
+        - Core metrics (sessions, lines of code, commits, PRs)
+        - Tool actions (accepted/rejected counts by tool)
+        - Model usage and costs
+        """
+
+        # Get the date range from the incremental state
+        start_value = date.last_value if date.last_value else date.initial_value
+        start_date_parsed = (
+            pendulum.parse(start_value) if start_value else pendulum.now()
+        )
+
+        # Ensure we have a DateTime object
+        if isinstance(start_date_parsed, pendulum.DateTime):
+            start_date = start_date_parsed
+        elif isinstance(start_date_parsed, pendulum.Date):
+            start_date = pendulum.datetime(
+                start_date_parsed.year, start_date_parsed.month, start_date_parsed.day
+            )
+        else:
+            start_date = pendulum.now()
+
+        end_filter = pendulum.now()
+        if date.end_value:
+            end_filter_parsed = pendulum.parse(date.end_value)
+            # Ensure we have a DateTime object
+            if isinstance(end_filter_parsed, pendulum.DateTime):
+                end_filter = end_filter_parsed
+            elif isinstance(end_filter_parsed, pendulum.Date):
+                end_filter = pendulum.datetime(
+                    end_filter_parsed.year,
+                    end_filter_parsed.month,
+                    end_filter_parsed.day,
+                )
+
+        # Iterate through each day in the range
+        current_date = start_date
+        while current_date.date() <= end_filter.date():
+            # Fetch data for the current date
+            for record in fetch_claude_code_usage(
+                api_key, current_date.to_date_string()
+            ):
+                yield record
+
+            # Move to the next day
+            current_date = current_date.add(days=1)
+
+    @dlt.resource(
+        name="usage_report",
+        write_disposition="merge",
+        primary_key=["bucket", "api_key_id", "workspace_id", "model", "service_tier"],
+    )
+    def usage_report() -> Iterator[Dict[str, Any]]:
+        """
+        Load usage report data from the messages endpoint.
+
+        Yields records with token usage and server tool usage metrics.
+        """
+
+        # Convert dates to ISO format with timezone
+        start_iso = start_date.to_iso8601_string()
+        end_iso = (
+            end_date.to_iso8601_string()
+            if end_date
+            else pendulum.now().to_iso8601_string()
+        )
+
+        for record in fetch_usage_report(
+            api_key,
+            starting_at=start_iso,
+            ending_at=end_iso,
+            bucket_width="1h",  # Hourly buckets by default
+        ):
+            yield record
+
+    @dlt.resource(
+        name="cost_report",
+        write_disposition="merge",
+        primary_key=["bucket", "workspace_id", "description"],
+    )
+    def cost_report() -> Iterator[Dict[str, Any]]:
+        """
+        Load cost report data.
+
+        Yields records with cost breakdowns by workspace and description.
+        """
+
+        # Convert dates to ISO format with timezone
+        start_iso = start_date.to_iso8601_string()
+        end_iso = (
+            end_date.to_iso8601_string()
+            if end_date
+            else pendulum.now().to_iso8601_string()
+        )
+
+        for record in fetch_cost_report(
+            api_key,
+            starting_at=start_iso,
+            ending_at=end_iso,
+        ):
+            yield record
+
+    @dlt.resource(
+        name="organization",
+        write_disposition="replace",
+    )
+    def organization() -> Iterator[Dict[str, Any]]:
+        """
+        Load organization information.
+
+        Yields a single record with organization details.
+        """
+        org_info = fetch_organization_info(api_key)
+        if org_info:
+            yield org_info
+
+    @dlt.resource(
+        name="workspaces",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def workspaces() -> Iterator[Dict[str, Any]]:
+        """
+        Load all workspaces in the organization.
+
+        Yields records with workspace details including name, type, and creation date.
+        """
+        for workspace in fetch_workspaces(api_key):
+            yield workspace
+
+    @dlt.resource(
+        name="api_keys",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def api_keys() -> Iterator[Dict[str, Any]]:
+        """
+        Load all API keys in the organization.
+
+        Yields records with API key details including name, status, and creation date.
+        """
+        for api_key_record in fetch_api_keys(api_key):
+            yield api_key_record
+
+    @dlt.resource(
+        name="invites",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def invites() -> Iterator[Dict[str, Any]]:
+        """
+        Load all pending invites in the organization.
+
+        Yields records with invite details including email, role, and expiration.
+        """
+        for invite in fetch_invites(api_key):
+            yield invite
+
+    @dlt.resource(
+        name="users",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def users() -> Iterator[Dict[str, Any]]:
+        """
+        Load all users in the organization.
+
+        Yields records with user details including email, name, and role.
+        """
+        for user in fetch_users(api_key):
+            yield user
+
+    @dlt.resource(
+        name="workspace_members",
+        write_disposition="replace",
+        primary_key=["workspace_id", "user_id"],
+    )
+    def workspace_members() -> Iterator[Dict[str, Any]]:
+        """
+        Load workspace members for all workspaces.
+
+        Yields records with workspace membership details.
+        """
+        # First get all workspaces
+        for workspace in fetch_workspaces(api_key):
+            workspace_id = workspace.get("id")
+            if workspace_id:
+                # Get members for each workspace
+                for member in fetch_workspace_members(api_key, workspace_id):
+                    yield member
+
+    return [
+        claude_code_usage,
+        usage_report,
+        cost_report,
+        organization,
+        workspaces,
+        api_keys,
+        invites,
+        users,
+        workspace_members,
+    ]
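
As with the other new sources, anthropic_source can be run directly through dlt. A minimal sketch, assuming a duckdb destination; the Admin API key is a placeholder, and with_resources is standard dlt for selecting a subset of resources:

import dlt
import pendulum

from ingestr.src.anthropic import anthropic_source

pipeline = dlt.pipeline(
    pipeline_name="anthropic",
    destination="duckdb",
    dataset_name="anthropic_data",
)

source = anthropic_source(
    api_key="sk-ant-admin-...",  # Admin API key placeholder
    initial_start_date=pendulum.datetime(2024, 1, 1),
)

# claude_code_usage resumes from the last loaded 'date' on later runs thanks to
# dlt.sources.incremental; the replace-disposition resources reload in full.
info = pipeline.run(source.with_resources("claude_code_usage", "usage_report"))
print(info)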