ingestr 0.13.94__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

@@ -44,9 +44,8 @@ def docebo_source(
44
44
  @dlt.resource(
45
45
  name="users",
46
46
  write_disposition="replace",
47
- primary_key="user_id",
48
47
  columns={
49
- "user_id": {"data_type": "text", "nullable": False},
48
+ "user_id": {"data_type": "text", "nullable": True},
50
49
  "username": {"data_type": "text", "nullable": True},
51
50
  "first_name": {"data_type": "text", "nullable": True},
52
51
  "last_name": {"data_type": "text", "nullable": True},
@@ -85,10 +84,9 @@ def docebo_source(
85
84
  @dlt.resource(
86
85
  name="courses",
87
86
  write_disposition="replace",
88
- primary_key="id_course",
89
87
  parallelized=True,
90
88
  columns={
91
- "id_course": {"data_type": "bigint", "nullable": False},
89
+ "id_course": {"data_type": "bigint", "nullable": True},
92
90
  "name": {"data_type": "text", "nullable": True},
93
91
  "uidCourse": {"data_type": "text", "nullable": True},
94
92
  "description": {"data_type": "text", "nullable": True},
@@ -130,17 +128,12 @@ def docebo_source(
130
128
  for course in courses_batch:
131
129
  yield normalize_docebo_dates(course)
132
130
 
133
- # normalized_courses = [normalize_docebo_dates(course) for course in courses_batch]
134
- # print("yielding a batch for courses")
135
- # yield normalized_courses
136
-
137
- # Phase 1: Core User and Organization Resources
138
131
  @dlt.resource(
139
132
  name="user_fields",
140
133
  write_disposition="replace",
141
134
  primary_key="id",
142
135
  columns={
143
- "id": {"data_type": "bigint", "nullable": False},
136
+ "id": {"data_type": "bigint", "nullable": True},
144
137
  "name": {"data_type": "text", "nullable": True},
145
138
  "type": {"data_type": "text", "nullable": True},
146
139
  "mandatory": {"data_type": "bool", "nullable": True},
@@ -161,9 +154,8 @@ def docebo_source(
161
154
  @dlt.resource(
162
155
  name="branches",
163
156
  write_disposition="replace",
164
- primary_key="id_org",
165
157
  columns={
166
- "id_org": {"data_type": "bigint", "nullable": False},
158
+ "id_org": {"data_type": "bigint", "nullable": True},
167
159
  "id_parent": {"data_type": "bigint", "nullable": True},
168
160
  "lft": {"data_type": "bigint", "nullable": True},
169
161
  "rgt": {"data_type": "bigint", "nullable": True},
@@ -185,7 +177,7 @@ def docebo_source(
185
177
  write_disposition="replace",
186
178
  primary_key="group_id",
187
179
  columns={
188
- "group_id": {"data_type": "bigint", "nullable": False},
180
+ "group_id": {"data_type": "bigint", "nullable": True},
189
181
  "name": {"data_type": "text", "nullable": True},
190
182
  "description": {"data_type": "text", "nullable": True},
191
183
  "language": {"data_type": "text", "nullable": True},
@@ -210,8 +202,8 @@ def docebo_source(
210
202
  write_disposition="replace",
211
203
  primary_key=["group_id", "user_id"],
212
204
  columns={
213
- "group_id": {"data_type": "bigint", "nullable": False},
214
- "user_id": {"data_type": "text", "nullable": False},
205
+ "group_id": {"data_type": "bigint", "nullable": True},
206
+ "user_id": {"data_type": "text", "nullable": True},
215
207
  "username": {"data_type": "text", "nullable": True},
216
208
  "first_name": {"data_type": "text", "nullable": True},
217
209
  "last_name": {"data_type": "text", "nullable": True},
@@ -232,7 +224,7 @@ def docebo_source(
232
224
  write_disposition="replace",
233
225
  primary_key="field_id",
234
226
  columns={
235
- "field_id": {"data_type": "bigint", "nullable": False},
227
+ "field_id": {"data_type": "bigint", "nullable": True},
236
228
  "type_field": {"data_type": "text", "nullable": True},
237
229
  "name_field": {"data_type": "text", "nullable": True},
238
230
  "is_mandatory": {"data_type": "bool", "nullable": True},
@@ -252,11 +244,10 @@ def docebo_source(
252
244
  name="learning_objects",
253
245
  data_from=courses,
254
246
  write_disposition="replace",
255
- primary_key=["course_id", "id_org"],
256
247
  parallelized=True,
257
248
  columns={
258
- "course_id": {"data_type": "bigint", "nullable": False},
259
- "id_org": {"data_type": "bigint", "nullable": False},
249
+ "course_id": {"data_type": "bigint", "nullable": True},
250
+ "id_org": {"data_type": "bigint", "nullable": True},
260
251
  "object_id": {"data_type": "bigint", "nullable": True},
261
252
  "lo_code": {"data_type": "text", "nullable": True},
262
253
  "lo_name": {"data_type": "text", "nullable": True},
@@ -286,9 +277,8 @@ def docebo_source(
286
277
  @dlt.resource(
287
278
  name="learning_plans",
288
279
  write_disposition="replace",
289
- primary_key="learning_plan_id",
290
280
  columns={
291
- "learning_plan_id": {"data_type": "bigint", "nullable": False},
281
+ "learning_plan_id": {"data_type": "bigint", "nullable": True},
292
282
  "uuid": {"data_type": "text", "nullable": True},
293
283
  "code": {"data_type": "text", "nullable": True},
294
284
  "title": {"data_type": "text", "nullable": True},
@@ -316,10 +306,9 @@ def docebo_source(
316
306
  @dlt.resource(
317
307
  name="learning_plan_enrollments",
318
308
  write_disposition="replace",
319
- primary_key=["id_path", "id_user"],
320
309
  columns={
321
- "id_path": {"data_type": "bigint", "nullable": False},
322
- "id_user": {"data_type": "text", "nullable": False},
310
+ "id_path": {"data_type": "bigint", "nullable": True},
311
+ "id_user": {"data_type": "text", "nullable": True},
323
312
  "enrollment_date": {"data_type": "timestamp", "nullable": True},
324
313
  "completion_date": {"data_type": "timestamp", "nullable": True},
325
314
  "enrollment_status": {"data_type": "text", "nullable": True},
@@ -339,11 +328,10 @@ def docebo_source(
339
328
  @dlt.resource(
340
329
  name="learning_plan_course_enrollments",
341
330
  write_disposition="replace",
342
- primary_key=["learning_plan_id", "course_id", "user_id"],
343
331
  columns={
344
- "learning_plan_id": {"data_type": "bigint", "nullable": False},
345
- "course_id": {"data_type": "bigint", "nullable": False},
346
- "user_id": {"data_type": "text", "nullable": False},
332
+ "learning_plan_id": {"data_type": "bigint", "nullable": True},
333
+ "course_id": {"data_type": "bigint", "nullable": True},
334
+ "user_id": {"data_type": "text", "nullable": True},
347
335
  "enrollment_date": {"data_type": "timestamp", "nullable": True},
348
336
  "completion_date": {"data_type": "timestamp", "nullable": True},
349
337
  "status": {"data_type": "text", "nullable": True},
@@ -362,10 +350,9 @@ def docebo_source(
362
350
  @dlt.resource(
363
351
  name="course_enrollments",
364
352
  write_disposition="replace",
365
- primary_key=["course_id", "user_id"],
366
353
  columns={
367
- "course_id": {"data_type": "bigint", "nullable": False},
368
- "user_id": {"data_type": "text", "nullable": False},
354
+ "course_id": {"data_type": "bigint", "nullable": True},
355
+ "user_id": {"data_type": "text", "nullable": True},
369
356
  "enrollment_date": {"data_type": "timestamp", "nullable": True},
370
357
  "completion_date": {"data_type": "timestamp", "nullable": True},
371
358
  "status": {"data_type": "text", "nullable": True},
@@ -388,10 +375,9 @@ def docebo_source(
388
375
  @dlt.resource(
389
376
  name="sessions",
390
377
  write_disposition="replace",
391
- primary_key=["course_id", "session_id"],
392
378
  columns={
393
- "course_id": {"data_type": "bigint", "nullable": False},
394
- "session_id": {"data_type": "bigint", "nullable": False},
379
+ "course_id": {"data_type": "bigint", "nullable": True},
380
+ "session_id": {"data_type": "bigint", "nullable": True},
395
381
  "name": {"data_type": "text", "nullable": True},
396
382
  "code": {"data_type": "text", "nullable": True},
397
383
  "date_start": {"data_type": "timestamp", "nullable": True},
@@ -416,9 +402,8 @@ def docebo_source(
416
402
  @dlt.resource(
417
403
  name="categories",
418
404
  write_disposition="replace",
419
- primary_key="id_cat",
420
405
  columns={
421
- "id_cat": {"data_type": "bigint", "nullable": False},
406
+ "id_cat": {"data_type": "bigint", "nullable": True},
422
407
  "code": {"data_type": "text", "nullable": True},
423
408
  "description": {"data_type": "text", "nullable": True},
424
409
  "id_parent": {"data_type": "bigint", "nullable": True},
@@ -437,9 +422,8 @@ def docebo_source(
437
422
  @dlt.resource(
438
423
  name="certifications",
439
424
  write_disposition="replace",
440
- primary_key="id_cert",
441
425
  columns={
442
- "id_cert": {"data_type": "bigint", "nullable": False},
426
+ "id_cert": {"data_type": "bigint", "nullable": True},
443
427
  "code": {"data_type": "text", "nullable": True},
444
428
  "title": {"data_type": "text", "nullable": True},
445
429
  "description": {"data_type": "text", "nullable": True},
@@ -463,9 +447,8 @@ def docebo_source(
463
447
  @dlt.resource(
464
448
  name="external_training",
465
449
  write_disposition="replace",
466
- primary_key="external_training_id",
467
450
  columns={
468
- "external_training_id": {"data_type": "bigint", "nullable": False},
451
+ "external_training_id": {"data_type": "bigint", "nullable": True},
469
452
  "user_id": {"data_type": "text", "nullable": True},
470
453
  "title": {"data_type": "text", "nullable": True},
471
454
  "description": {"data_type": "text", "nullable": True},
@@ -495,10 +478,9 @@ def docebo_source(
495
478
  write_disposition="replace",
496
479
  name="polls",
497
480
  parallelized=True,
498
- primary_key=["poll_id", "course_id"],
499
481
  columns={
500
- "poll_id": {"data_type": "bigint", "nullable": False},
501
- "course_id": {"data_type": "bigint", "nullable": False},
482
+ "poll_id": {"data_type": "bigint", "nullable": True},
483
+ "course_id": {"data_type": "bigint", "nullable": True},
502
484
  "poll_title": {"data_type": "text", "nullable": True},
503
485
  "object_type": {"data_type": "text", "nullable": True},
504
486
  "lo_type": {"data_type": "text", "nullable": True},
@@ -533,10 +515,10 @@ def docebo_source(
533
515
  parallelized=True,
534
516
  name="survey_answers",
535
517
  columns={
536
- "course_id": {"data_type": "bigint", "nullable": False},
537
- "poll_id": {"data_type": "bigint", "nullable": False},
518
+ "course_id": {"data_type": "bigint", "nullable": True},
519
+ "poll_id": {"data_type": "bigint", "nullable": True},
538
520
  "poll_title": {"data_type": "text", "nullable": True},
539
- "question_id": {"data_type": "bigint", "nullable": False},
521
+ "question_id": {"data_type": "bigint", "nullable": True},
540
522
  "question_type": {"data_type": "text", "nullable": True},
541
523
  "question_title": {"data_type": "text", "nullable": True},
542
524
  "answer": {"data_type": "text", "nullable": True},
@@ -0,0 +1,112 @@
1
+ """Elasticsearch destination helpers"""
2
+
3
+ import json
4
+ from typing import Any, Dict, Iterator
5
+ from urllib.parse import urlparse
6
+
7
+ import dlt
8
+
9
+ from elasticsearch import Elasticsearch
10
+ from elasticsearch.helpers import bulk
11
+
12
+
13
def _strip_dlt_metadata(record: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of *record* without keys starting with ``_dlt_``."""
    return {
        key: value for key, value in record.items() if not key.startswith("_dlt_")
    }


def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
    """Yield cleaned documents from a JSONL file.

    Every non-blank line is parsed as one JSON object; keys starting with
    ``_dlt_`` are removed before the document is yielded.

    Args:
        file_path: Path of the JSONL file to read.

    Yields:
        One dict per non-blank line, without ``_dlt_``-prefixed keys.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading the file.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                yield _strip_dlt_metadata(json.loads(stripped))


def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
    """Yield cleaned documents from an iterable of items.

    Items that are not dicts are skipped; for dict items, keys starting with
    ``_dlt_`` are removed before the item is yielded.

    Args:
        items: Iterable of candidate documents.

    Yields:
        One dict per dict item, without ``_dlt_``-prefixed keys.
    """
    for item in items:
        if isinstance(item, dict):
            yield _strip_dlt_metadata(item)
33
+
34
+
35
def _to_bulk_action(doc: Dict[str, Any], index_name: str) -> Dict[str, Any]:
    """Build the bulk-helper action dict for one document.

    The document id is taken from ``_id`` if present (and removed from the
    stored source, since it is metadata), otherwise from ``id`` (which stays
    in the source). Without either key no ``_id`` is set.
    """
    source = doc.copy()
    action: Dict[str, Any] = {"_index": index_name, "_source": source}
    if "_id" in doc:
        action["_id"] = str(doc["_id"])
        source.pop("_id", None)
    elif "id" in doc:
        action["_id"] = str(doc["id"])
    return action


@dlt.destination(
    name="elasticsearch",
    loader_file_format="typed-jsonl",
    batch_size=1000,
    naming_convention="snake_case",
)
def elasticsearch_insert(
    items, table, connection_string: str = dlt.secrets.value
) -> None:
    """Insert data into an Elasticsearch index.

    The index named after the table is deleted if it already exists, then the
    documents are written with the bulk helper.

    Args:
        items: Data items; a ``str`` is treated as a JSONL file path, anything
            else as an iterable of documents.
        table: Table metadata; only ``table["name"]`` is read here and used as
            the index name.
        connection_string: Elasticsearch connection URL. Missing parts fall
            back to host ``localhost``, port ``9200`` and scheme ``http``;
            credentials are used only when both username and password are set.

    Raises:
        Exception: If the bulk insert fails or reports failed items; the
            original error is chained as ``__cause__``.
    """
    parsed = urlparse(connection_string)

    es_config: Dict[str, Any] = {
        "hosts": [
            {
                "host": parsed.hostname or "localhost",
                "port": parsed.port or 9200,
                "scheme": parsed.scheme or "http",
            }
        ]
    }

    # Add authentication only when both parts are present
    if parsed.username and parsed.password:
        es_config["http_auth"] = (parsed.username, parsed.password)

    index_name = table["name"]

    client = Elasticsearch(**es_config)
    try:
        # Replace mode: delete existing index if it exists.
        # NOTE(review): this runs on every invocation; if the destination is
        # invoked once per batch (batch_size=1000 above), earlier batches of
        # the same load would be dropped -- confirm against dlt's call pattern.
        if client.indices.exists(index=index_name):
            client.indices.delete(index=index_name)

        if isinstance(items, str):
            documents = process_file_items(items)
        else:
            documents = process_iterable_items(items)

        # Lazily convert documents so the whole batch is never materialized
        actions = (_to_bulk_action(doc, index_name) for doc in documents)

        try:
            _, failed_items = bulk(client, actions, request_timeout=60)
            if failed_items:
                failed_count = (
                    len(failed_items)
                    if isinstance(failed_items, list)
                    else failed_items
                )
                raise Exception(
                    f"Failed to insert {failed_count} documents: {failed_items}"
                )
        except Exception as e:
            # Chain explicitly so the underlying error stays visible
            raise Exception(f"Elasticsearch bulk insert failed: {str(e)}") from e
    finally:
        # Release the client's connections; a new client is created per call
        client.close()
@@ -179,7 +179,12 @@ def facebook_insights_source(
179
179
  start_date = date_start.last_value
180
180
  if date_start.end_value:
181
181
  end_date_val = pendulum.instance(date_start.end_value)
182
- end_date = end_date_val if isinstance(end_date_val, pendulum.Date) else end_date_val.date()
182
+
183
+ end_date = (
184
+ end_date_val
185
+ if isinstance(end_date_val, pendulum.Date)
186
+ else end_date_val.date()
187
+ )
183
188
  else:
184
189
  end_date = pendulum.now().date()
185
190
 
ingestr/src/factory.py CHANGED
@@ -11,7 +11,9 @@ from ingestr.src.destinations import (
11
11
  CsvDestination,
12
12
  DatabricksDestination,
13
13
  DuckDBDestination,
14
+ ElasticsearchDestination,
14
15
  GCSDestination,
16
+ MongoDBDestination,
15
17
  MotherduckDestination,
16
18
  MsSQLDestination,
17
19
  MySqlDestination,
@@ -21,10 +23,12 @@ from ingestr.src.destinations import (
21
23
  SnowflakeDestination,
22
24
  SqliteDestination,
23
25
  SynapseDestination,
26
+ TrinoDestination,
24
27
  )
25
28
  from ingestr.src.sources import (
26
29
  AdjustSource,
27
30
  AirtableSource,
31
+ AnthropicSource,
28
32
  AppleAppStoreSource,
29
33
  ApplovinMaxSource,
30
34
  AppLovinSource,
@@ -41,6 +45,7 @@ from ingestr.src.sources import (
41
45
  FluxxSource,
42
46
  FrankfurterSource,
43
47
  FreshdeskSource,
48
+ FundraiseupSource,
44
49
  GCSSource,
45
50
  GitHubSource,
46
51
  GoogleAdsSource,
@@ -106,6 +111,7 @@ SQL_SOURCE_SCHEMES = [
106
111
  "databricks",
107
112
  "db2",
108
113
  "spanner",
114
+ "trino",
109
115
  ]
110
116
 
111
117
 
@@ -144,6 +150,7 @@ class SourceDestinationFactory:
144
150
  source_scheme: str
145
151
  destination_scheme: str
146
152
  sources: Dict[str, Type[SourceProtocol]] = {
153
+ "anthropic": AnthropicSource,
147
154
  "csv": LocalCsvSource,
148
155
  "docebo": DoceboSource,
149
156
  "mongodb": MongoDbSource,
@@ -185,6 +192,7 @@ class SourceDestinationFactory:
185
192
  "pipedrive": PipedriveSource,
186
193
  "frankfurter": FrankfurterSource,
187
194
  "freshdesk": FreshdeskSource,
195
+ "fundraiseup": FundraiseupSource,
188
196
  "trustpilot": TrustpilotSource,
189
197
  "phantombuster": PhantombusterSource,
190
198
  "elasticsearch": ElasticsearchSource,
@@ -221,11 +229,15 @@ class SourceDestinationFactory:
221
229
  "athena": AthenaDestination,
222
230
  "clickhouse+native": ClickhouseDestination,
223
231
  "clickhouse": ClickhouseDestination,
232
+ "elasticsearch": ElasticsearchDestination,
233
+ "mongodb": MongoDBDestination,
234
+ "mongodb+srv": MongoDBDestination,
224
235
  "s3": S3Destination,
225
236
  "gs": GCSDestination,
226
237
  "sqlite": SqliteDestination,
227
238
  "mysql": MySqlDestination,
228
239
  "mysql+pymysql": MySqlDestination,
240
+ "trino": TrinoDestination,
229
241
  }
230
242
 
231
243
  def __init__(self, source_uri: str, destination_uri: str):