ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ingestr has been flagged as potentially problematic.

Files changed (79)
  1. ingestr/main.py +22 -3
  2. ingestr/src/adjust/__init__.py +4 -4
  3. ingestr/src/allium/__init__.py +128 -0
  4. ingestr/src/anthropic/__init__.py +277 -0
  5. ingestr/src/anthropic/helpers.py +525 -0
  6. ingestr/src/appstore/__init__.py +1 -0
  7. ingestr/src/asana_source/__init__.py +1 -1
  8. ingestr/src/buildinfo.py +1 -1
  9. ingestr/src/chess/__init__.py +1 -1
  10. ingestr/src/couchbase_source/__init__.py +118 -0
  11. ingestr/src/couchbase_source/helpers.py +135 -0
  12. ingestr/src/cursor/__init__.py +83 -0
  13. ingestr/src/cursor/helpers.py +188 -0
  14. ingestr/src/destinations.py +169 -1
  15. ingestr/src/docebo/__init__.py +589 -0
  16. ingestr/src/docebo/client.py +435 -0
  17. ingestr/src/docebo/helpers.py +97 -0
  18. ingestr/src/elasticsearch/helpers.py +138 -0
  19. ingestr/src/errors.py +8 -0
  20. ingestr/src/facebook_ads/__init__.py +26 -23
  21. ingestr/src/facebook_ads/helpers.py +47 -1
  22. ingestr/src/factory.py +48 -0
  23. ingestr/src/filesystem/__init__.py +8 -3
  24. ingestr/src/filters.py +9 -0
  25. ingestr/src/fluxx/__init__.py +9906 -0
  26. ingestr/src/fluxx/helpers.py +209 -0
  27. ingestr/src/frankfurter/__init__.py +157 -163
  28. ingestr/src/frankfurter/helpers.py +3 -3
  29. ingestr/src/freshdesk/__init__.py +25 -8
  30. ingestr/src/freshdesk/freshdesk_client.py +40 -5
  31. ingestr/src/fundraiseup/__init__.py +49 -0
  32. ingestr/src/fundraiseup/client.py +81 -0
  33. ingestr/src/github/__init__.py +6 -4
  34. ingestr/src/google_analytics/__init__.py +1 -1
  35. ingestr/src/hostaway/__init__.py +302 -0
  36. ingestr/src/hostaway/client.py +288 -0
  37. ingestr/src/http/__init__.py +35 -0
  38. ingestr/src/http/readers.py +114 -0
  39. ingestr/src/hubspot/__init__.py +6 -12
  40. ingestr/src/influxdb/__init__.py +1 -0
  41. ingestr/src/intercom/__init__.py +142 -0
  42. ingestr/src/intercom/helpers.py +674 -0
  43. ingestr/src/intercom/settings.py +279 -0
  44. ingestr/src/jira_source/__init__.py +340 -0
  45. ingestr/src/jira_source/helpers.py +439 -0
  46. ingestr/src/jira_source/settings.py +170 -0
  47. ingestr/src/klaviyo/__init__.py +5 -5
  48. ingestr/src/linear/__init__.py +553 -116
  49. ingestr/src/linear/helpers.py +77 -38
  50. ingestr/src/mailchimp/__init__.py +126 -0
  51. ingestr/src/mailchimp/helpers.py +226 -0
  52. ingestr/src/mailchimp/settings.py +164 -0
  53. ingestr/src/masking.py +344 -0
  54. ingestr/src/monday/__init__.py +246 -0
  55. ingestr/src/monday/helpers.py +392 -0
  56. ingestr/src/monday/settings.py +328 -0
  57. ingestr/src/mongodb/__init__.py +5 -2
  58. ingestr/src/mongodb/helpers.py +384 -10
  59. ingestr/src/plusvibeai/__init__.py +335 -0
  60. ingestr/src/plusvibeai/helpers.py +544 -0
  61. ingestr/src/plusvibeai/settings.py +252 -0
  62. ingestr/src/revenuecat/__init__.py +83 -0
  63. ingestr/src/revenuecat/helpers.py +237 -0
  64. ingestr/src/salesforce/__init__.py +15 -8
  65. ingestr/src/shopify/__init__.py +1 -1
  66. ingestr/src/smartsheets/__init__.py +33 -5
  67. ingestr/src/socrata_source/__init__.py +83 -0
  68. ingestr/src/socrata_source/helpers.py +85 -0
  69. ingestr/src/socrata_source/settings.py +8 -0
  70. ingestr/src/sources.py +1418 -54
  71. ingestr/src/stripe_analytics/__init__.py +2 -19
  72. ingestr/src/wise/__init__.py +68 -0
  73. ingestr/src/wise/client.py +63 -0
  74. ingestr/tests/unit/test_smartsheets.py +6 -9
  75. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
  76. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
  77. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
  78. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
  79. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/docebo/client.py ADDED
@@ -0,0 +1,435 @@
+ """Docebo API Client for handling authentication and paginated requests."""
+
+ from typing import Any, Dict, Iterator, Optional
+
+ from ingestr.src.docebo.helpers import normalize_docebo_dates
+ from ingestr.src.http_client import create_client
+
+
+ class DoceboClient:
+     """Client for interacting with Docebo LMS API."""
+
+     def __init__(
+         self,
+         base_url: str,
+         client_id: str,
+         client_secret: str,
+         username: Optional[str] = None,
+         password: Optional[str] = None,
+     ):
+         """
+         Initialize Docebo API client.
+
+         Args:
+             base_url: The base URL of your Docebo instance
+             client_id: OAuth2 client ID
+             client_secret: OAuth2 client secret
+             username: Optional username for password grant type
+             password: Optional password for password grant type
+         """
+         self.base_url = base_url.rstrip("/")
+         self.client_id = client_id
+         self.client_secret = client_secret
+         self.username = username
+         self.password = password
+         self._access_token = None
+         # Use shared HTTP client with retry logic
+         self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])
+
+     def get_access_token(self) -> str:
+         """
+         Get or refresh OAuth2 access token.
+
+         Returns:
+             Access token string
+
+         Raises:
+             Exception: If authentication fails
+         """
+         if self._access_token:
+             return self._access_token
+
+         auth_endpoint = f"{self.base_url}/oauth2/token"
+
+         # Use client_credentials grant type if no username/password provided
+         if not self.username or not self.password:
+             data = {
+                 "client_id": self.client_id,
+                 "client_secret": self.client_secret,
+                 "grant_type": "client_credentials",
+                 "scope": "api",
+             }
+         else:
+             data = {
+                 "client_id": self.client_id,
+                 "client_secret": self.client_secret,
+                 "username": self.username,
+                 "password": self.password,
+                 "grant_type": "password",
+                 "scope": "api",
+             }
+
+         response = self.client.post(url=auth_endpoint, data=data)
+         response.raise_for_status()
+         token_data = response.json()
+         self._access_token = token_data.get("access_token")
+         if not self._access_token:
+             raise Exception("Failed to obtain access token from Docebo")
+
+         return self._access_token
+
+     def get_paginated_data(
+         self,
+         endpoint: str,
+         page_size: int = 200,
+         params: Optional[Dict[str, Any]] = None,
+     ) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch paginated data from a Docebo API endpoint.
+
+         Args:
+             endpoint: API endpoint path (e.g., "manage/v1/user")
+             page_size: Number of items per page
+             params: Additional query parameters
+
+         Yields:
+             Batches of items from the API
+         """
+         url = f"{self.base_url}/{endpoint}"
+         headers = {"authorization": f"Bearer {self.get_access_token()}"}
+
+         page = 1
+         has_more_data = True
+
+         while has_more_data:
+             request_params = {"page": page, "page_size": page_size}
+             if params:
+                 request_params.update(params)
+
+             response = self.client.get(url=url, headers=headers, params=request_params)
+             response.raise_for_status()
+             data = response.json()
+
+             # Handle paginated response structure
+             if "data" in data:
+                 # Most Docebo endpoints return data in this structure
+                 if "items" in data["data"]:
+                     items = data["data"]["items"]
+                     if items:
+                         # Normalize dates for each item before yielding
+                         normalized_items = [
+                             normalize_docebo_dates(item) for item in items
+                         ]
+                         yield normalized_items
+
+                     # Check for more pages
+                     has_more_data = data["data"].get("has_more_data", False)
+                     if has_more_data and "total_page_count" in data["data"]:
+                         total_pages = data["data"]["total_page_count"]
+                         if page >= total_pages:
+                             has_more_data = False
+                 # Some endpoints might return data directly as a list
+                 elif isinstance(data["data"], list):
+                     items = data["data"]
+                     if items:
+                         # Normalize dates for each item before yielding
+                         normalized_items = [
+                             normalize_docebo_dates(item) for item in items
+                         ]
+                         yield normalized_items
+                         # For direct list responses, check if we got a full page
+                         has_more_data = len(items) == page_size
+                     else:
+                         has_more_data = False
+             # Some endpoints might return items directly
+             elif isinstance(data, list):
+                 if data:
+                     # Normalize dates for each item before yielding
+                     normalized_items = [normalize_docebo_dates(item) for item in data]
+                     yield normalized_items
+                     has_more_data = len(data) == page_size
+                 else:
+                     has_more_data = False
+
+             page += 1
+
+     def fetch_users(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all users from Docebo.
+
+         Yields:
+             Batches of user data
+         """
+         yield from self.get_paginated_data("manage/v1/user")
+
+     def fetch_courses(self, page_size: int = 200) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all courses from Docebo.
+
+         Yields:
+             Batches of course data
+         """
+         yield from self.get_paginated_data("learn/v1/courses", page_size=page_size)
+
+     # Phase 1: Core User and Organization Resources
+     def fetch_user_fields(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all user fields from Docebo.
+
+         Yields:
+             Batches of user field definitions
+         """
+         yield from self.get_paginated_data("manage/v1/user_fields")
+
+     def fetch_branches(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all branches/organizational units from Docebo.
+
+         Yields:
+             Batches of branch/org chart data
+         """
+         yield from self.get_paginated_data("manage/v1/orgchart")
+
+     # Phase 2: Group Management
+     def fetch_groups(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all groups/audiences from Docebo.
+
+         Yields:
+             Batches of group data
+         """
+         yield from self.get_paginated_data("audiences/v1/audience")
+
+     def fetch_all_group_members(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all group members for all groups.
+
+         Yields:
+             Batches of group member data with group_id included
+         """
+         # First fetch all groups
+         all_groups: list[Dict[str, Any]] = []
+         for group_batch in self.fetch_groups():
+             all_groups.extend(group_batch)
+
+         # Then fetch members for each group
+         for group in all_groups:
+             group_id = (
+                 group.get("group_id") or group.get("audience_id") or group.get("id")
+             )
+             if group_id:
+                 try:
+                     for member_batch in self.get_paginated_data(
+                         f"manage/v1/group/{group_id}/members"
+                     ):
+                         # Add group_id to each member record
+                         for member in member_batch:
+                             member["group_id"] = group_id
+                         yield member_batch
+                 except Exception as e:
+                     print(f"Error fetching members for group {group_id}: {e}")
+                     continue
+
+     # Phase 3: Advanced Course Resources
+     def fetch_course_fields(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all course field definitions from Docebo.
+
+         Yields:
+             Batches of course field data
+         """
+         yield from self.get_paginated_data("learn/v1/courses/field")
+
+     def fetch_all_course_learning_objects(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch learning objects for all courses.
+
+         Yields:
+             Batches of learning object data
+         """
+         # First fetch all courses
+         all_courses: list[Dict[str, Any]] = []
+         for course_batch in self.fetch_courses():
+             all_courses.extend(course_batch)
+
+         # Then fetch learning objects for each course
+         for course in all_courses:
+             course_id = course.get("id_course") or course.get("course_id")
+             if course_id:
+                 try:
+                     endpoint = f"learn/v1/courses/{course_id}/los"
+                     for lo_batch in self.get_paginated_data(endpoint):
+                         # Add course_id to each learning object
+                         for lo in lo_batch:
+                             if "course_id" not in lo:
+                                 lo["course_id"] = course_id
+                         yield lo_batch
+                 except Exception as e:
+                     print(
+                         f"Error fetching learning objects for course {course_id}: {e}"
+                     )
+                     continue
+
+     # Phase 4: Learning Plans
+     def fetch_learning_plans(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all learning plans from Docebo.
+
+         Yields:
+             Batches of learning plan data
+         """
+         yield from self.get_paginated_data("learningplan/v1/learningplans")
+
+     def fetch_learning_plan_enrollments(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all learning plan enrollments.
+
+         Yields:
+             Batches of learning plan enrollment data
+         """
+         yield from self.get_paginated_data(
+             "learningplan/v1/learningplans/enrollments",
+             params={"extra_fields[]": "enrollment_status"},
+         )
+
+     def fetch_all_learning_plan_course_enrollments(
+         self,
+     ) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch course enrollments for all learning plans.
+
+         Yields:
+             Batches of learning plan course enrollment data
+         """
+         # First fetch all learning plans
+         all_plans: list[Dict[str, Any]] = []
+         for plan_batch in self.fetch_learning_plans():
+             all_plans.extend(plan_batch)
+
+         # Then fetch course enrollments for each learning plan
+         for plan in all_plans:
+             plan_id = (
+                 plan.get("id_path") or plan.get("learning_plan_id") or plan.get("id")
+             )
+             if plan_id:
+                 try:
+                     endpoint = (
+                         f"learningplan/v1/learningplans/{plan_id}/courses/enrollments"
+                     )
+                     for enrollment_batch in self.get_paginated_data(
+                         endpoint, params={"enrollment_level[]": "student"}
+                     ):
+                         # Add learning_plan_id to each enrollment
+                         for enrollment in enrollment_batch:
+                             enrollment["learning_plan_id"] = plan_id
+                         yield enrollment_batch
+                 except Exception as e:
+                     print(
+                         f"Error fetching course enrollments for learning plan {plan_id}: {e}"
+                     )
+                     continue
+
+     # Phase 5: Enrollments and Surveys
+     def fetch_all_course_enrollments(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch enrollments for all courses.
+
+         Yields:
+             Batches of course enrollment data
+         """
+         # First fetch all courses
+         all_courses: list[Dict[str, Any]] = []
+         for course_batch in self.fetch_courses():
+             all_courses.extend(course_batch)
+
+         # Then fetch enrollments for each course
+         for course in all_courses:
+             course_id = course.get("id_course") or course.get("course_id")
+             if course_id:
+                 try:
+                     endpoint = f"course/v1/courses/{course_id}/enrollments"
+                     for enrollment_batch in self.get_paginated_data(
+                         endpoint, params={"level[]": "3"}
+                     ):
+                         # Add course_id to each enrollment
+                         for enrollment in enrollment_batch:
+                             enrollment["course_id"] = course_id
+                         yield enrollment_batch
+                 except Exception as e:
+                     print(f"Error fetching enrollments for course {course_id}: {e}")
+                     continue
+
+     # Additional Resources
+     def fetch_sessions(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all ILT/classroom sessions for all courses.
+
+         Yields:
+             Batches of session data
+         """
+         # First fetch all courses
+         all_courses: list[Dict[str, Any]] = []
+         for course_batch in self.fetch_courses():
+             all_courses.extend(course_batch)
+
+         # Then fetch sessions for each course
+         for course in all_courses:
+             course_id = course.get("id_course") or course.get("course_id")
+             if course_id:
+                 try:
+                     endpoint = f"learn/v1/courses/{course_id}/sessions"
+                     for session_batch in self.get_paginated_data(endpoint):
+                         # Add course_id to each session
+                         for session in session_batch:
+                             session["course_id"] = course_id
+                         yield session_batch
+                 except Exception:
+                     # Many courses may not have sessions, so just continue
+                     continue
+
+     def fetch_categories(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all course categories.
+
+         Yields:
+             Batches of category data
+         """
+         yield from self.get_paginated_data("learn/v1/categories")
+
+     def fetch_certifications(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all certifications.
+
+         Yields:
+             Batches of certification data
+         """
+         yield from self.get_paginated_data("learn/v1/certification")
+
+     def fetch_external_training(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all external training records.
+
+         Yields:
+             Batches of external training data
+         """
+         yield from self.get_paginated_data("learn/v1/external_training")
+
+     def fetch_survey_answers_for_poll(
+         self, poll_id: int, course_id: int
+     ) -> Dict[str, Any]:
+         """
+         Fetch survey answers for a specific poll.
+
+         Args:
+             poll_id: The poll/survey ID
+             course_id: The course ID containing the poll
+
+         Returns:
+             Survey answer data or empty dict if no answers
+         """
+         url = f"{self.base_url}/learn/v1/survey/{poll_id}/answer"
+         headers = {"authorization": f"Bearer {self.get_access_token()}"}
+         params = {"id_course": course_id}
+
+         response = self.client.get(url, headers=headers, params=params)
+         return normalize_docebo_dates(response.json().get("data", {}))
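
For orientation, here is a minimal sketch of how the new DoceboClient might be driven on its own; the instance URL and OAuth2 credentials below are placeholders, not values shipped in the release:

    from ingestr.src.docebo.client import DoceboClient

    # Placeholder instance URL and credentials for illustration only.
    client = DoceboClient(
        base_url="https://example.docebosaas.com",
        client_id="my-client-id",
        client_secret="my-client-secret",
    )

    # Each fetch_* method yields lists of dicts whose date fields have
    # already been cleaned by normalize_docebo_dates().
    for batch in client.fetch_users():
        print(f"fetched {len(batch)} users")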
ingestr/src/docebo/helpers.py ADDED
@@ -0,0 +1,97 @@
+ """Helper functions for Docebo API data processing."""
+
+ from datetime import datetime
+ from typing import Any, Dict, Union
+
+
+ def normalize_date_field(date_value: Any) -> Union[datetime, str, None]:
+     """
+     Normalize a single date field that may contain invalid dates.
+
+     Args:
+         date_value: The date value to normalize (string, datetime, or None)
+
+     Returns:
+         Normalized datetime object or None for invalid/empty dates
+     """
+     # Unix epoch datetime (1970-01-01 00:00:00 UTC)
+     epoch_datetime = datetime(1970, 1, 1)
+
+     # Handle string dates
+     if isinstance(date_value, str):
+         # Handle '0000-00-00' or '0000-00-00 00:00:00'
+         if date_value.startswith("0000-00-00"):
+             return epoch_datetime
+         # Handle other invalid date formats
+         elif date_value in ["", "0", "null", "NULL"]:
+             return None
+         # Try to parse valid date strings
+         else:
+             try:
+                 # Try common date formats
+                 for fmt in [
+                     "%Y-%m-%d %H:%M:%S",
+                     "%Y-%m-%d",
+                     "%Y/%m/%d %H:%M:%S",
+                     "%Y/%m/%d",
+                 ]:
+                     try:
+                         return datetime.strptime(date_value, fmt)
+                     except ValueError:
+                         continue
+                 # If no format matches, return the original string
+                 return date_value
+             except Exception:
+                 return date_value
+     # Handle datetime objects - pass through
+     elif isinstance(date_value, datetime):
+         return date_value
+     # Handle cases where the field might be None or empty
+     elif not date_value:
+         return None
+
+     # Return the original value for other types
+     return date_value
+
+
+ def normalize_docebo_dates(item: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Normalize Docebo date fields that contain '0000-00-00' to Unix epoch (1970-01-01).
+
+     Args:
+         item: Dictionary containing data from Docebo API
+
+     Returns:
+         Dictionary with normalized date fields
+     """
+     # Date fields that might contain '0000-00-00'
+     # Add more fields as needed for different resources
+     date_fields = [
+         "last_access_date",
+         "last_update",
+         "creation_date",
+         "date_begin",  # Course field
+         "date_end",  # Course field
+         "date_publish",  # Course field
+         "date_unpublish",  # Course field
+         "enrollment_date",  # Enrollment field
+         "completion_date",  # Enrollment field
+         "date_assigned",  # Assignment field
+         "date_completed",  # Completion field
+         "survey_date",  # Survey field
+         "start_date",  # Course/Plan field
+         "end_date",  # Course/Plan field
+         "date_created",  # Generic creation date
+         "created_on",  # Learning plan field
+         "updated_on",  # Learning plan field
+         "date_modified",  # Generic modification date
+         "expire_date",  # Expiration date
+         "date_last_updated",  # Update date
+         "date",  # Generic date field (used in survey answers)
+     ]
+
+     for field in date_fields:
+         if field in item:
+             item[field] = normalize_date_field(item[field])
+
+     return item
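
The normalization rules are easiest to see on concrete values. A small sketch of the expected behavior, derived directly from the branches of normalize_date_field above:

    from datetime import datetime

    from ingestr.src.docebo.helpers import normalize_date_field

    # Docebo's zero dates collapse to the Unix epoch.
    assert normalize_date_field("0000-00-00 00:00:00") == datetime(1970, 1, 1)
    # Empty/placeholder strings become None.
    assert normalize_date_field("") is None
    # Parseable strings become datetime objects.
    assert normalize_date_field("2024-05-01") == datetime(2024, 5, 1)
    # Anything unparseable passes through unchanged.
    assert normalize_date_field("not a date") == "not a date"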
ingestr/src/elasticsearch/helpers.py ADDED
@@ -0,0 +1,138 @@
+ """Elasticsearch destination helpers"""
+
+ import json
+ import logging
+ from typing import Any, Dict, Iterator
+ from urllib.parse import urlparse
+
+ import dlt
+
+ from elasticsearch import Elasticsearch
+ from elasticsearch.helpers import bulk
+
+ # Suppress Elasticsearch transport logging
+ logging.getLogger("elasticsearch.transport").setLevel(logging.WARNING)
+ logging.getLogger("elastic_transport.transport").setLevel(logging.WARNING)
+
+
+ def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
+     """Process items from a file path (JSONL format)."""
+     with open(file_path, "r") as f:
+         for line in f:
+             if line.strip():
+                 doc = json.loads(line.strip())
+                 # Clean DLT metadata
+                 cleaned_doc = {
+                     k: v for k, v in doc.items() if not k.startswith("_dlt_")
+                 }
+                 yield cleaned_doc
+
+
+ def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
+     """Process items from an iterable."""
+     for item in items:
+         if isinstance(item, dict):
+             # Clean DLT metadata
+             cleaned_item = {k: v for k, v in item.items() if not k.startswith("_dlt_")}
+             yield cleaned_item
+
+
+ @dlt.destination(
+     name="elasticsearch",
+     loader_file_format="typed-jsonl",
+     batch_size=1000,
+     naming_convention="snake_case",
+ )
+ def elasticsearch_insert(
+     items, table, connection_string: str = dlt.secrets.value
+ ) -> None:
+     """Insert data into Elasticsearch index.
+
+     Args:
+         items: Data items (file path or iterable)
+         table: Table metadata containing name and schema info
+         connection_string: Elasticsearch connection string
+     """
+     # Parse connection string
+     parsed = urlparse(connection_string)
+
+     # Build Elasticsearch client configuration
+     actual_url = connection_string
+     secure = True  # Default to HTTPS (secure by default)
+
+     if connection_string.startswith("elasticsearch://"):
+         actual_url = connection_string.replace("elasticsearch://", "")
+
+         # Parse to check for query parameters
+         temp_parsed = urlparse("http://" + actual_url)
+         from urllib.parse import parse_qs
+
+         query_params = parse_qs(temp_parsed.query)
+
+         # Check ?secure parameter (defaults to true)
+         if "secure" in query_params:
+             secure = query_params["secure"][0].lower() in ["true", "1", "yes"]
+
+         # Remove query params from URL for ES client
+         actual_url = actual_url.split("?")[0]
+
+         # Add scheme
+         scheme = "https" if secure else "http"
+         actual_url = f"{scheme}://{actual_url}"
+
+         parsed = urlparse(actual_url)
+
+     es_config: Dict[str, Any] = {
+         "hosts": [actual_url],
+         "verify_certs": secure,
+         "ssl_show_warn": False,
+     }
+
+     # Add authentication if present
+     if parsed.username and parsed.password:
+         es_config["http_auth"] = (parsed.username, parsed.password)
+
+     # Get index name from table metadata
+     index_name = table["name"]
+
+     # Connect to Elasticsearch
+     client = Elasticsearch(**es_config)
+
+     # Replace mode: delete existing index if it exists
+     if client.indices.exists(index=index_name):
+         client.indices.delete(index=index_name)
+
+     # Process and insert documents
+     if isinstance(items, str):
+         documents = process_file_items(items)
+     else:
+         documents = process_iterable_items(items)
+
+     # Prepare documents for bulk insert as generator
+     def doc_generator():
+         for doc in documents:
+             es_doc: Dict[str, Any] = {"_index": index_name, "_source": doc.copy()}
+
+             # Use _id if present, otherwise let ES generate one
+             if "_id" in doc:
+                 es_doc["_id"] = str(doc["_id"])
+                 # Remove _id from source since it's metadata
+                 if "_id" in es_doc["_source"]:
+                     del es_doc["_source"]["_id"]
+             elif "id" in doc:
+                 es_doc["_id"] = str(doc["id"])
+
+             yield es_doc
+
+     # Bulk insert
+     try:
+         _, failed_items = bulk(client, doc_generator(), request_timeout=60)
+         if failed_items:
+             failed_count = (
+                 len(failed_items) if isinstance(failed_items, list) else failed_items
+             )
+             raise Exception(
+                 f"Failed to insert {failed_count} documents: {failed_items}"
+             )
+     except Exception as e:
+         raise Exception(f"Elasticsearch bulk insert failed: {str(e)}")
ingestr/src/errors.py CHANGED
@@ -1,3 +1,6 @@
+ import requests
+
+
  class MissingValueError(Exception):
      def __init__(self, value, source):
          super().__init__(f"{value} is required to connect to {source}")
@@ -16,3 +19,8 @@ class InvalidBlobTableError(Exception):
              f"Invalid source table for {source} "
              "Ensure that the table is in the format {bucket-name}/{file glob}"
          )
+
+
+ class HTTPError(Exception):
+     def __init__(self, source: requests.HTTPError):
+         super().__init__(f"HTTP {source.response.status_code}: {source.response.text}")