PyPI - ingestr - Versions diffs - 0.13.94__py3-none-any.whl → 0.14.0__py3-none-any.whl - Mend

ingestr 0.13.94__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ingestr might be problematic. Click here for more details.

Files changed (23) hide show

ingestr/src/adjust/__init__.py +4 -4
ingestr/src/anthropic/__init__.py +277 -0
ingestr/src/anthropic/helpers.py +525 -0
ingestr/src/buildinfo.py +1 -1
ingestr/src/destinations.py +142 -0
ingestr/src/docebo/__init__.py +28 -46
ingestr/src/elasticsearch/helpers.py +112 -0
ingestr/src/facebook_ads/__init__.py +6 -1
ingestr/src/factory.py +12 -0
ingestr/src/fluxx/__init__.py +4032 -11242
ingestr/src/frankfurter/__init__.py +157 -157
ingestr/src/fundraiseup/__init__.py +49 -0
ingestr/src/fundraiseup/client.py +81 -0
ingestr/src/google_analytics/__init__.py +1 -1
ingestr/src/mongodb/__init__.py +1 -1
ingestr/src/mongodb/helpers.py +69 -1
ingestr/src/sources.py +80 -0
ingestr/tests/unit/test_smartsheets.py +1 -1
{ingestr-0.13.94.dist-info → ingestr-0.14.0.dist-info}/METADATA +12 -7
{ingestr-0.13.94.dist-info → ingestr-0.14.0.dist-info}/RECORD +23 -18
{ingestr-0.13.94.dist-info → ingestr-0.14.0.dist-info}/WHEEL +0 -0
{ingestr-0.13.94.dist-info → ingestr-0.14.0.dist-info}/entry_points.txt +0 -0
{ingestr-0.13.94.dist-info → ingestr-0.14.0.dist-info}/licenses/LICENSE.md +0 -0

ingestr/src/docebo/__init__.py CHANGED Viewed

@@ -44,9 +44,8 @@ def docebo_source(
     @dlt.resource(
         name="users",
         write_disposition="replace",
-        primary_key="user_id",
         columns={
-            "user_id": {"data_type": "text", "nullable": False},
+            "user_id": {"data_type": "text", "nullable": True},
             "username": {"data_type": "text", "nullable": True},
             "first_name": {"data_type": "text", "nullable": True},
             "last_name": {"data_type": "text", "nullable": True},
@@ -85,10 +84,9 @@ def docebo_source(
     @dlt.resource(
         name="courses",
         write_disposition="replace",
-        primary_key="id_course",
         parallelized=True,
         columns={
-            "id_course": {"data_type": "bigint", "nullable": False},
+            "id_course": {"data_type": "bigint", "nullable": True},
             "name": {"data_type": "text", "nullable": True},
             "uidCourse": {"data_type": "text", "nullable": True},
             "description": {"data_type": "text", "nullable": True},
@@ -130,17 +128,12 @@ def docebo_source(
             for course in courses_batch:
                 yield normalize_docebo_dates(course)
-            # normalized_courses = [normalize_docebo_dates(course) for course in courses_batch]
-            # print("yielding a batch for courses")
-            # yield normalized_courses
-    # Phase 1: Core User and Organization Resources
     @dlt.resource(
         name="user_fields",
         write_disposition="replace",
         primary_key="id",
         columns={
-            "id": {"data_type": "bigint", "nullable": False},
+            "id": {"data_type": "bigint", "nullable": True},
             "name": {"data_type": "text", "nullable": True},
             "type": {"data_type": "text", "nullable": True},
             "mandatory": {"data_type": "bool", "nullable": True},
@@ -161,9 +154,8 @@ def docebo_source(
     @dlt.resource(
         name="branches",
         write_disposition="replace",
-        primary_key="id_org",
         columns={
-            "id_org": {"data_type": "bigint", "nullable": False},
+            "id_org": {"data_type": "bigint", "nullable": True},
             "id_parent": {"data_type": "bigint", "nullable": True},
             "lft": {"data_type": "bigint", "nullable": True},
             "rgt": {"data_type": "bigint", "nullable": True},
@@ -185,7 +177,7 @@ def docebo_source(
         write_disposition="replace",
         primary_key="group_id",
         columns={
-            "group_id": {"data_type": "bigint", "nullable": False},
+            "group_id": {"data_type": "bigint", "nullable": True},
             "name": {"data_type": "text", "nullable": True},
             "description": {"data_type": "text", "nullable": True},
             "language": {"data_type": "text", "nullable": True},
@@ -210,8 +202,8 @@ def docebo_source(
         write_disposition="replace",
         primary_key=["group_id", "user_id"],
         columns={
-            "group_id": {"data_type": "bigint", "nullable": False},
-            "user_id": {"data_type": "text", "nullable": False},
+            "group_id": {"data_type": "bigint", "nullable": True},
+            "user_id": {"data_type": "text", "nullable": True},
             "username": {"data_type": "text", "nullable": True},
             "first_name": {"data_type": "text", "nullable": True},
             "last_name": {"data_type": "text", "nullable": True},
@@ -232,7 +224,7 @@ def docebo_source(
         write_disposition="replace",
         primary_key="field_id",
         columns={
-            "field_id": {"data_type": "bigint", "nullable": False},
+            "field_id": {"data_type": "bigint", "nullable": True},
             "type_field": {"data_type": "text", "nullable": True},
             "name_field": {"data_type": "text", "nullable": True},
             "is_mandatory": {"data_type": "bool", "nullable": True},
@@ -252,11 +244,10 @@ def docebo_source(
         name="learning_objects",
         data_from=courses,
         write_disposition="replace",
-        primary_key=["course_id", "id_org"],
         parallelized=True,
         columns={
-            "course_id": {"data_type": "bigint", "nullable": False},
-            "id_org": {"data_type": "bigint", "nullable": False},
+            "course_id": {"data_type": "bigint", "nullable": True},
+            "id_org": {"data_type": "bigint", "nullable": True},
             "object_id": {"data_type": "bigint", "nullable": True},
             "lo_code": {"data_type": "text", "nullable": True},
             "lo_name": {"data_type": "text", "nullable": True},
@@ -286,9 +277,8 @@ def docebo_source(
     @dlt.resource(
         name="learning_plans",
         write_disposition="replace",
-        primary_key="learning_plan_id",
         columns={
-            "learning_plan_id": {"data_type": "bigint", "nullable": False},
+            "learning_plan_id": {"data_type": "bigint", "nullable": True},
             "uuid": {"data_type": "text", "nullable": True},
             "code": {"data_type": "text", "nullable": True},
             "title": {"data_type": "text", "nullable": True},
@@ -316,10 +306,9 @@ def docebo_source(
     @dlt.resource(
         name="learning_plan_enrollments",
         write_disposition="replace",
-        primary_key=["id_path", "id_user"],
         columns={
-            "id_path": {"data_type": "bigint", "nullable": False},
-            "id_user": {"data_type": "text", "nullable": False},
+            "id_path": {"data_type": "bigint", "nullable": True},
+            "id_user": {"data_type": "text", "nullable": True},
             "enrollment_date": {"data_type": "timestamp", "nullable": True},
             "completion_date": {"data_type": "timestamp", "nullable": True},
             "enrollment_status": {"data_type": "text", "nullable": True},
@@ -339,11 +328,10 @@ def docebo_source(
     @dlt.resource(
         name="learning_plan_course_enrollments",
         write_disposition="replace",
-        primary_key=["learning_plan_id", "course_id", "user_id"],
         columns={
-            "learning_plan_id": {"data_type": "bigint", "nullable": False},
-            "course_id": {"data_type": "bigint", "nullable": False},
-            "user_id": {"data_type": "text", "nullable": False},
+            "learning_plan_id": {"data_type": "bigint", "nullable": True},
+            "course_id": {"data_type": "bigint", "nullable": True},
+            "user_id": {"data_type": "text", "nullable": True},
             "enrollment_date": {"data_type": "timestamp", "nullable": True},
             "completion_date": {"data_type": "timestamp", "nullable": True},
             "status": {"data_type": "text", "nullable": True},
@@ -362,10 +350,9 @@ def docebo_source(
     @dlt.resource(
         name="course_enrollments",
         write_disposition="replace",
-        primary_key=["course_id", "user_id"],
         columns={
-            "course_id": {"data_type": "bigint", "nullable": False},
-            "user_id": {"data_type": "text", "nullable": False},
+            "course_id": {"data_type": "bigint", "nullable": True},
+            "user_id": {"data_type": "text", "nullable": True},
             "enrollment_date": {"data_type": "timestamp", "nullable": True},
             "completion_date": {"data_type": "timestamp", "nullable": True},
             "status": {"data_type": "text", "nullable": True},
@@ -388,10 +375,9 @@ def docebo_source(
     @dlt.resource(
         name="sessions",
         write_disposition="replace",
-        primary_key=["course_id", "session_id"],
         columns={
-            "course_id": {"data_type": "bigint", "nullable": False},
-            "session_id": {"data_type": "bigint", "nullable": False},
+            "course_id": {"data_type": "bigint", "nullable": True},
+            "session_id": {"data_type": "bigint", "nullable": True},
             "name": {"data_type": "text", "nullable": True},
             "code": {"data_type": "text", "nullable": True},
             "date_start": {"data_type": "timestamp", "nullable": True},
@@ -416,9 +402,8 @@ def docebo_source(
     @dlt.resource(
         name="categories",
         write_disposition="replace",
-        primary_key="id_cat",
         columns={
-            "id_cat": {"data_type": "bigint", "nullable": False},
+            "id_cat": {"data_type": "bigint", "nullable": True},
             "code": {"data_type": "text", "nullable": True},
             "description": {"data_type": "text", "nullable": True},
             "id_parent": {"data_type": "bigint", "nullable": True},
@@ -437,9 +422,8 @@ def docebo_source(
     @dlt.resource(
         name="certifications",
         write_disposition="replace",
-        primary_key="id_cert",
         columns={
-            "id_cert": {"data_type": "bigint", "nullable": False},
+            "id_cert": {"data_type": "bigint", "nullable": True},
             "code": {"data_type": "text", "nullable": True},
             "title": {"data_type": "text", "nullable": True},
             "description": {"data_type": "text", "nullable": True},
@@ -463,9 +447,8 @@ def docebo_source(
     @dlt.resource(
         name="external_training",
         write_disposition="replace",
-        primary_key="external_training_id",
         columns={
-            "external_training_id": {"data_type": "bigint", "nullable": False},
+            "external_training_id": {"data_type": "bigint", "nullable": True},
             "user_id": {"data_type": "text", "nullable": True},
             "title": {"data_type": "text", "nullable": True},
             "description": {"data_type": "text", "nullable": True},
@@ -495,10 +478,9 @@ def docebo_source(
         write_disposition="replace",
         name="polls",
         parallelized=True,
-        primary_key=["poll_id", "course_id"],
         columns={
-            "poll_id": {"data_type": "bigint", "nullable": False},
-            "course_id": {"data_type": "bigint", "nullable": False},
+            "poll_id": {"data_type": "bigint", "nullable": True},
+            "course_id": {"data_type": "bigint", "nullable": True},
             "poll_title": {"data_type": "text", "nullable": True},
             "object_type": {"data_type": "text", "nullable": True},
             "lo_type": {"data_type": "text", "nullable": True},
@@ -533,10 +515,10 @@ def docebo_source(
         parallelized=True,
         name="survey_answers",
         columns={
-            "course_id": {"data_type": "bigint", "nullable": False},
-            "poll_id": {"data_type": "bigint", "nullable": False},
+            "course_id": {"data_type": "bigint", "nullable": True},
+            "poll_id": {"data_type": "bigint", "nullable": True},
             "poll_title": {"data_type": "text", "nullable": True},
-            "question_id": {"data_type": "bigint", "nullable": False},
+            "question_id": {"data_type": "bigint", "nullable": True},
             "question_type": {"data_type": "text", "nullable": True},
             "question_title": {"data_type": "text", "nullable": True},
             "answer": {"data_type": "text", "nullable": True},

ingestr/src/elasticsearch/helpers.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""Elasticsearch destination helpers"""
+import json
+from typing import Any, Dict, Iterator
+from urllib.parse import urlparse
+import dlt
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
+    """Process items from a file path (JSONL format)."""
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                doc = json.loads(line.strip())
+                # Clean DLT metadata
+                cleaned_doc = {
+                    k: v for k, v in doc.items() if not k.startswith("_dlt_")
+                }
+                yield cleaned_doc
+def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
+    """Process items from an iterable."""
+    for item in items:
+        if isinstance(item, dict):
+            # Clean DLT metadata
+            cleaned_item = {k: v for k, v in item.items() if not k.startswith("_dlt_")}
+            yield cleaned_item
+@dlt.destination(
+    name="elasticsearch",
+    loader_file_format="typed-jsonl",
+    batch_size=1000,
+    naming_convention="snake_case",
+)
+def elasticsearch_insert(
+    items, table, connection_string: str = dlt.secrets.value
+) -> None:
+    """Insert data into Elasticsearch index.
+    Args:
+        items: Data items (file path or iterable)
+        table: Table metadata containing name and schema info
+        connection_string: Elasticsearch connection string
+    """
+    # Parse connection string
+    parsed = urlparse(connection_string)
+    # Build Elasticsearch client configuration
+    hosts = [
+        {
+            "host": parsed.hostname or "localhost",
+            "port": parsed.port or 9200,
+            "scheme": parsed.scheme or "http",
+        }
+    ]
+    es_config: Dict[str, Any] = {"hosts": hosts}
+    # Add authentication if present
+    if parsed.username and parsed.password:
+        es_config["http_auth"] = (parsed.username, parsed.password)
+    # Get index name from table metadata
+    index_name = table["name"]
+    # Connect to Elasticsearch
+    client = Elasticsearch(**es_config)
+    # Replace mode: delete existing index if it exists
+    if client.indices.exists(index=index_name):
+        client.indices.delete(index=index_name)
+    # Process and insert documents
+    if isinstance(items, str):
+        documents = process_file_items(items)
+    else:
+        documents = process_iterable_items(items)
+    # Prepare documents for bulk insert as generator
+    def doc_generator():
+        for doc in documents:
+            es_doc: Dict[str, Any] = {"_index": index_name, "_source": doc.copy()}
+            # Use _id if present, otherwise let ES generate one
+            if "_id" in doc:
+                es_doc["_id"] = str(doc["_id"])
+                # Remove _id from source since it's metadata
+                if "_id" in es_doc["_source"]:
+                    del es_doc["_source"]["_id"]
+            elif "id" in doc:
+                es_doc["_id"] = str(doc["id"])
+            yield es_doc
+    # Bulk insert
+    try:
+        _, failed_items = bulk(client, doc_generator(), request_timeout=60)
+        if failed_items:
+            failed_count = (
+                len(failed_items) if isinstance(failed_items, list) else failed_items
+            )
+            raise Exception(
+                f"Failed to insert {failed_count} documents: {failed_items}"
+            )
+    except Exception as e:
+        raise Exception(f"Elasticsearch bulk insert failed: {str(e)}")

ingestr/src/facebook_ads/__init__.py CHANGED Viewed

@@ -179,7 +179,12 @@ def facebook_insights_source(
         start_date = date_start.last_value
         if date_start.end_value:
             end_date_val = pendulum.instance(date_start.end_value)
-            end_date = end_date_val if isinstance(end_date_val, pendulum.Date) else end_date_val.date()
+            end_date = (
+                end_date_val
+                if isinstance(end_date_val, pendulum.Date)
+                else end_date_val.date()
+            )
         else:
             end_date = pendulum.now().date()

ingestr/src/factory.py CHANGED Viewed

@@ -11,7 +11,9 @@ from ingestr.src.destinations import (
     CsvDestination,
     DatabricksDestination,
     DuckDBDestination,
+    ElasticsearchDestination,
     GCSDestination,
+    MongoDBDestination,
     MotherduckDestination,
     MsSQLDestination,
     MySqlDestination,
@@ -21,10 +23,12 @@ from ingestr.src.destinations import (
     SnowflakeDestination,
     SqliteDestination,
     SynapseDestination,
+    TrinoDestination,
 )
 from ingestr.src.sources import (
     AdjustSource,
     AirtableSource,
+    AnthropicSource,
     AppleAppStoreSource,
     ApplovinMaxSource,
     AppLovinSource,
@@ -41,6 +45,7 @@ from ingestr.src.sources import (
     FluxxSource,
     FrankfurterSource,
     FreshdeskSource,
+    FundraiseupSource,
     GCSSource,
     GitHubSource,
     GoogleAdsSource,
@@ -106,6 +111,7 @@ SQL_SOURCE_SCHEMES = [
     "databricks",
     "db2",
     "spanner",
+    "trino",
 ]
@@ -144,6 +150,7 @@ class SourceDestinationFactory:
     source_scheme: str
     destination_scheme: str
     sources: Dict[str, Type[SourceProtocol]] = {
+        "anthropic": AnthropicSource,
         "csv": LocalCsvSource,
         "docebo": DoceboSource,
         "mongodb": MongoDbSource,
@@ -185,6 +192,7 @@ class SourceDestinationFactory:
         "pipedrive": PipedriveSource,
         "frankfurter": FrankfurterSource,
         "freshdesk": FreshdeskSource,
+        "fundraiseup": FundraiseupSource,
         "trustpilot": TrustpilotSource,
         "phantombuster": PhantombusterSource,
         "elasticsearch": ElasticsearchSource,
@@ -221,11 +229,15 @@ class SourceDestinationFactory:
         "athena": AthenaDestination,
         "clickhouse+native": ClickhouseDestination,
         "clickhouse": ClickhouseDestination,
+        "elasticsearch": ElasticsearchDestination,
+        "mongodb": MongoDBDestination,
+        "mongodb+srv": MongoDBDestination,
         "s3": S3Destination,
         "gs": GCSDestination,
         "sqlite": SqliteDestination,
         "mysql": MySqlDestination,
         "mysql+pymysql": MySqlDestination,
+        "trino": TrinoDestination,
     }
     def __init__(self, source_uri: str, destination_uri: str):

ingestr 0.13.94__py3-none-any.whl → 0.14.0__py3-none-any.whl

Potentially problematic release.

ingestr 0.13.94__py3-none-any.whl → 0.14.0__py3-none-any.whl