ingestr 0.13.78__py3-none-any.whl → 0.13.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


ingestr/main.py CHANGED
@@ -1,3 +1,4 @@
+ import warnings
  from datetime import datetime
  from enum import Enum
  from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated

  from ingestr.src.telemetry.event import track

+ try:
+     from duckdb_engine import DuckDBEngineWarning
+
+     warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+ except ImportError:
+     # duckdb-engine not installed
+     pass
+
  app = typer.Typer(
      name="ingestr",
      help="ingestr is the CLI tool to ingest data from one source to another",
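The guarded import above registers a warnings filter only when the optional duckdb-engine package is importable, so DuckDB dialect warnings stay out of the CLI output without making duckdb-engine a hard dependency. A minimal sketch of the same pattern in isolation (the demo warning at the end is made up, purely to show the filter taking effect):

    import warnings

    try:
        from duckdb_engine import DuckDBEngineWarning
    except ImportError:
        DuckDBEngineWarning = None  # optional dependency absent; nothing to silence

    if DuckDBEngineWarning is not None:
        warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
        # With the filter registered, warnings of this category are dropped silently.
        warnings.warn("demo duckdb-engine notice", DuckDBEngineWarning)
        print("no DuckDBEngineWarning surfaced")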
@@ -506,7 +515,6 @@ def ingest(

      if factory.source_scheme == "sqlite":
          source_table = "main." + source_table.split(".")[-1]
-

      if (
          incremental_key
@@ -600,10 +608,9 @@ def ingest(
      if factory.source_scheme == "influxdb":
          if primary_key:
              write_disposition = "merge"
-

      start_time = datetime.now()
-
+
      run_info: LoadInfo = pipeline.run(
          dlt_source,
          **destination.dlt_run_params(
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
- version = "v0.13.78"
+ version = "v0.13.80"
ingestr/src/destinations.py CHANGED
@@ -147,6 +147,24 @@ class DuckDBDestination(GenericSqlDestination):
          return dlt.destinations.duckdb(uri, **kwargs)


+ class MotherduckDestination(GenericSqlDestination):
+     def dlt_dest(self, uri: str, **kwargs):
+         from urllib.parse import parse_qs, urlparse
+
+         parsed = urlparse(uri)
+         query = parse_qs(parsed.query)
+         token = query.get("token", [None])[0]
+         from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+         creds = {
+             "password": token,
+         }
+         if parsed.path.lstrip("/"):
+             creds["database"] = parsed.path.lstrip("/")
+
+         return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
  def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
      # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
      tup = struct.unpack(
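To make the new destination concrete, here is a standalone sketch of how the URI parsing above decomposes a MotherDuck connection string. The URI shape (database in the path, token as a query parameter) and the token value are assumptions for illustration; only urllib is exercised, no dlt or MotherDuck connection is made:

    from urllib.parse import parse_qs, urlparse

    # Hypothetical URI of the form md:///<database>?token=<service-token>
    uri = "md:///analytics?token=example-token"

    parsed = urlparse(uri)
    query = parse_qs(parsed.query)

    creds = {"password": query.get("token", [None])[0]}
    if parsed.path.lstrip("/"):
        creds["database"] = parsed.path.lstrip("/")

    print(creds)  # {'password': 'example-token', 'database': 'analytics'}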
@@ -26,7 +26,6 @@ from .settings import (
      DEFAULT_LEAD_FIELDS,
      INSIGHT_FIELDS_TYPES,
      INSIGHTS_BREAKDOWNS_OPTIONS,
-     INSIGHTS_PRIMARY_KEY,
      INVALID_INSIGHTS_FIELDS,
      TInsightsBreakdownOptions,
      TInsightsLevels,
@@ -118,6 +117,9 @@ def facebook_insights_source(
      app_api_version: str = None,
      start_date: pendulum.DateTime | None = None,
      end_date: pendulum.DateTime | None = None,
+     insights_max_wait_to_finish_seconds: int = 60 * 60 * 4,
+     insights_max_wait_to_start_seconds: int = 60 * 30,
+     insights_max_async_sleep_seconds: int = 20,
  ) -> DltResource:
      """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.

@@ -207,7 +209,9 @@ def facebook_insights_source(
          }
          job = execute_job(
              account.get_insights(params=query, is_async=True),
-             insights_max_async_sleep_seconds=20,
+             insights_max_async_sleep_seconds=insights_max_async_sleep_seconds,
+             insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds,
+             insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds,
          )
          output = list(map(process_report_item, job.get_result()))
          yield output
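The three new parameters bound how long the source waits on Facebook's asynchronous insights job: how long it may sit queued, how long it may run overall, and how long to sleep between polls. The sketch below is a simplified, generic polling loop meant only to illustrate those semantics; it is not the execute_job implementation shipped in this package:

    import time

    def wait_for_job(poll, max_wait_to_start=60 * 30, max_wait_to_finish=60 * 60 * 4, sleep_seconds=20):
        """Illustrative loop: poll() returns one of 'queued', 'running', or 'done'."""
        started = time.monotonic()
        while True:
            status = poll()
            elapsed = time.monotonic() - started
            if status == "done":
                return True
            if status == "queued" and elapsed > max_wait_to_start:
                raise TimeoutError("job never started")
            if elapsed > max_wait_to_finish:
                raise TimeoutError("job did not finish in time")
            time.sleep(sleep_seconds)

    # A fake job that is queued twice, runs twice, then completes.
    states = iter(["queued", "queued", "running", "running", "done"])
    print(wait_for_job(lambda: next(states), sleep_seconds=0))  # True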
@@ -144,7 +144,7 @@ def execute_job(
              raise InsightsJobTimeout(
                  "facebook_insights",
                  pretty_error_message.format(
-                     job_id, insights_max_wait_to_finish_seconds // 60
+                     job_id, insights_max_wait_to_finish_seconds
                  ),
              )

ingestr/src/factory.py CHANGED
@@ -12,6 +12,7 @@ from ingestr.src.destinations import (
      DatabricksDestination,
      DuckDBDestination,
      GCSDestination,
+     MotherduckDestination,
      MsSQLDestination,
      MySqlDestination,
      PostgresDestination,
@@ -85,6 +86,8 @@ SQL_SOURCE_SCHEMES = [
      "mysql",
      "mysql+pymysql",
      "mysql+mysqlconnector",
+     "md",
+     "motherduck",
      "postgres",
      "postgresql",
      "postgresql+psycopg2",
@@ -195,6 +198,8 @@ class SourceDestinationFactory:
          "cratedb": CrateDBDestination,
          "databricks": DatabricksDestination,
          "duckdb": DuckDBDestination,
+         "motherduck": MotherduckDestination,
+         "md": MotherduckDestination,
          "mssql": MsSQLDestination,
          "postgres": PostgresDestination,
          "postgresql": PostgresDestination,
@@ -4,6 +4,8 @@ etc. to the database"""
  from typing import Any, Dict, Generator, Iterable, List, Optional

  import dlt
+ import pendulum
+ from dlt.common.time import ensure_pendulum_datetime
  from dlt.sources import DltResource

  from .freshdesk_client import FreshdeskClient
@@ -12,10 +14,12 @@ from .settings import DEFAULT_ENDPOINTS

  @dlt.source()
  def freshdesk_source(
-     endpoints: Optional[List[str]] = None,
+     domain: str,
+     api_secret_key: str,
+     start_date: pendulum.DateTime,
+     end_date: Optional[pendulum.DateTime] = None,
      per_page: int = 100,
-     domain: str = dlt.secrets.value,
-     api_secret_key: str = dlt.secrets.value,
+     endpoints: Optional[List[str]] = None,
  ) -> Iterable[DltResource]:
      """
      Retrieves data from specified Freshdesk API endpoints.
@@ -39,7 +43,11 @@ def freshdesk_source(
      def incremental_resource(
          endpoint: str,
          updated_at: Optional[Any] = dlt.sources.incremental(
-             "updated_at", initial_value="2022-01-01T00:00:00Z"
+             "updated_at",
+             initial_value=start_date.isoformat(),
+             end_value=end_date.isoformat() if end_date else None,
+             range_start="closed",
+             range_end="closed",
          ),
      ) -> Generator[Dict[Any, Any], Any, None]:
          """
@@ -48,15 +56,22 @@ def freshdesk_source(
          to ensure incremental loading.
          """

-         # Retrieve the last updated timestamp to fetch only new or updated records.
-         if updated_at is not None:
-             updated_at = updated_at.last_value
+         if updated_at.last_value is not None:
+             start_date = ensure_pendulum_datetime(updated_at.last_value)
+         else:
+             start_date = start_date
+
+         if updated_at.end_value is not None:
+             end_date = ensure_pendulum_datetime(updated_at.end_value)
+         else:
+             end_date = pendulum.now(tz="UTC")

          # Use the FreshdeskClient instance to fetch paginated responses
          yield from freshdesk.paginated_response(
              endpoint=endpoint,
              per_page=per_page,
-             updated_at=updated_at,
+             start_date=start_date,
+             end_date=end_date,
          )

      # Set default endpoints if not provided
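The rewritten resource derives an explicit [start_date, end_date] window from dlt's incremental state instead of forwarding a raw last_value string. A stand-alone sketch of that window resolution, using SimpleNamespace in place of dlt's incremental object and pendulum.parse in place of ensure_pendulum_datetime:

    from types import SimpleNamespace

    import pendulum

    def resolve_window(updated_at, configured_start):
        # Mirrors the branching above: prefer stored incremental state, otherwise
        # fall back to the configured start_date and to "now" for the open end.
        if updated_at.last_value is not None:
            start = pendulum.parse(updated_at.last_value)
        else:
            start = configured_start
        if updated_at.end_value is not None:
            end = pendulum.parse(updated_at.end_value)
        else:
            end = pendulum.now(tz="UTC")
        return start, end

    # First run: no incremental state yet, so the configured start_date is used.
    print(resolve_window(SimpleNamespace(last_value=None, end_value=None), pendulum.datetime(2024, 1, 1)))

    # Later run: the stored last_value becomes the new window start.
    print(resolve_window(SimpleNamespace(last_value="2024-06-01T00:00:00Z", end_value=None), pendulum.datetime(2024, 1, 1)))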
@@ -2,8 +2,9 @@

  import logging
  import time
- from typing import Any, Dict, Iterable, Optional
+ from typing import Any, Dict, Iterable

+ import pendulum
  from dlt.common.typing import TDataItem
  from dlt.sources.helpers import requests

@@ -67,7 +68,8 @@ class FreshdeskClient:
          self,
          endpoint: str,
          per_page: int,
-         updated_at: Optional[str] = None,
+         start_date: pendulum.DateTime,
+         end_date: pendulum.DateTime,
      ) -> Iterable[TDataItem]:
          """
          Fetches a paginated response from a specified endpoint.
@@ -88,8 +90,8 @@
              param_key = (
                  "updated_since" if endpoint == "tickets" else "_updated_since"
              )
-             if updated_at:
-                 params[param_key] = updated_at
+
+             params[param_key] = start_date.to_iso8601_string()

              # Handle requests with rate-limiting
              # A maximum of 300 pages (30000 tickets) will be returned.
@@ -98,5 +100,14 @@

              if not data:
                  break  # Stop if no data or max page limit reached
-             yield data
+
+             filtered_data = [
+                 item
+                 for item in data
+                 if "updated_at" in item
+                 and pendulum.parse(item["updated_at"]) <= end_date
+             ]
+             if not filtered_data:
+                 break
+             yield filtered_data
              page += 1
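The client now drops records updated after end_date and stops paginating as soon as a page contributes nothing inside the window. A small illustration of that filter on made-up ticket payloads:

    import pendulum

    end_date = pendulum.parse("2024-06-30T23:59:59Z")

    page = [
        {"id": 1, "updated_at": "2024-06-15T10:00:00Z"},
        {"id": 2, "updated_at": "2024-07-02T08:30:00Z"},  # after the window, dropped
        {"id": 3},                                        # no updated_at field, dropped
    ]

    filtered = [
        item
        for item in page
        if "updated_at" in item and pendulum.parse(item["updated_at"]) <= end_date
    ]
    print([item["id"] for item in filtered])  # [1]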
@@ -91,7 +91,9 @@ def github_repo_events(
      """

      # use naming function in table name to generate separate tables for each event
-     @dlt.resource(primary_key= "id", table_name=lambda i: i["type"], write_disposition="merge")
+     @dlt.resource(
+         primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
+     )
      def repo_events(
          last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
              "created_at",
@@ -105,7 +107,7 @@ def github_repo_events(
          repos_path = (
              f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
          )
-
+
          # Get the date range from the incremental state
          start_filter = pendulum.parse(
              last_created_at.last_value or last_created_at.initial_value
@@ -115,7 +117,7 @@ def github_repo_events(
              if last_created_at.end_value
              else pendulum.now()
          )
-
+
          for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
              # Filter events by date range
              filtered_events = []
@@ -61,6 +61,7 @@ def get_stargazers(
          page_items,
      )

+
  def get_reactions_data(
      node_type: str,
      owner: str,
@@ -7,6 +7,7 @@ from dlt.sources import DltResource

  from .client import InfluxClient

+
  @dlt.source(max_table_nesting=0)
  def influxdb_source(
      measurement: str,
@@ -3,7 +3,22 @@ from typing import Any, Dict, Iterable, Iterator
  import dlt
  import pendulum

- from .helpers import _normalize_issue, _normalize_team, _paginate
+ from .helpers import _paginate, normalize_dictionaries
+
+
+ def _get_date_range(updated_at, start_date):
+     """Extract current start and end dates from incremental state."""
+     if updated_at.last_value:
+         current_start_date = pendulum.parse(updated_at.last_value)
+     else:
+         current_start_date = pendulum.parse(start_date)
+
+     if updated_at.end_value:
+         current_end_date = pendulum.parse(updated_at.end_value)
+     else:
+         current_end_date = pendulum.now(tz="UTC")
+
+     return current_start_date, current_end_date

  ISSUES_QUERY = """
  query Issues($cursor: String) {
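The new _get_date_range helper centralises the start/end resolution that each Linear resource previously repeated inline. A quick, self-contained check of its behaviour; the helper is restated here so the snippet runs on its own, and SimpleNamespace stands in for dlt's incremental state object:

    from types import SimpleNamespace

    import pendulum

    # Restated from the hunk above so the example is self-contained.
    def _get_date_range(updated_at, start_date):
        if updated_at.last_value:
            current_start_date = pendulum.parse(updated_at.last_value)
        else:
            current_start_date = pendulum.parse(start_date)
        if updated_at.end_value:
            current_end_date = pendulum.parse(updated_at.end_value)
        else:
            current_end_date = pendulum.now(tz="UTC")
        return current_start_date, current_end_date

    # Backfill-style run: both bounds come from the incremental state.
    state = SimpleNamespace(last_value="2024-05-01T00:00:00Z", end_value="2024-05-31T23:59:59Z")
    print(_get_date_range(state, "2024-01-01T00:00:00Z"))

    # First run: no state yet, so start_date and "now" bound the window.
    print(_get_date_range(SimpleNamespace(last_value=None, end_value=None), "2024-01-01T00:00:00Z"))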
@@ -84,7 +99,25 @@ query Users($cursor: String) {
    }
  }
  """
-
+ WORKFLOW_STATES_QUERY = """
+ query WorkflowStates($cursor: String) {
+   workflowStates(first: 50, after: $cursor) {
+     nodes {
+       archivedAt
+       color
+       createdAt
+       id
+       inheritedFrom { id }
+       name
+       position
+       team { id }
+       type
+       updatedAt
+     }
+     pageInfo { hasNextPage endCursor }
+   }
+ }
+ """

  @dlt.source(name="linear", max_table_nesting=0)
  def linear_source(
@@ -102,20 +135,12 @@ def linear_source(
              range_end="closed",
          ),
      ) -> Iterator[Dict[str, Any]]:
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
-
-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)

          for item in _paginate(api_key, ISSUES_QUERY, "issues"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield _normalize_issue(item)
+                     yield normalize_dictionaries(item)

      @dlt.resource(name="projects", primary_key="id", write_disposition="merge")
      def projects(
@@ -127,20 +152,12 @@ def linear_source(
              range_end="closed",
          ),
      ) -> Iterator[Dict[str, Any]]:
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
-
-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)

          for item in _paginate(api_key, PROJECTS_QUERY, "projects"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield item
+                     yield normalize_dictionaries(item)

      @dlt.resource(name="teams", primary_key="id", write_disposition="merge")
      def teams(
@@ -153,21 +170,13 @@ def linear_source(
          ),
      ) -> Iterator[Dict[str, Any]]:
          print(start_date)
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)
          print(current_start_date)

-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
-
          for item in _paginate(api_key, TEAMS_QUERY, "teams"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield _normalize_team(item)
+                     yield normalize_dictionaries(item)

      @dlt.resource(name="users", primary_key="id", write_disposition="merge")
      def users(
@@ -179,19 +188,28 @@ def linear_source(
              range_end="closed",
          ),
      ) -> Iterator[Dict[str, Any]]:
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
-
-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)

          for item in _paginate(api_key, USERS_QUERY, "users"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield item
+                     yield normalize_dictionaries(item)
+
+     @dlt.resource(name="workflow_states", primary_key="id", write_disposition="merge")
+     def workflow_states(
+         updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+             "updatedAt",
+             initial_value=start_date.isoformat(),
+             end_value=end_date.isoformat() if end_date else None,
+             range_start="closed",
+             range_end="closed",
+         ),
+     ) -> Iterator[Dict[str, Any]]:
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)
+
+         for item in _paginate(api_key, WORKFLOW_STATES_QUERY, "workflowStates"):
+             if pendulum.parse(item["updatedAt"]) >= current_start_date:
+                 if pendulum.parse(item["updatedAt"]) <= current_end_date:
+                     yield normalize_dictionaries(item)
+     return [issues, projects, teams, users, workflow_states]

-     return issues, projects, teams, users
@@ -32,41 +32,24 @@ def _paginate(api_key: str, query: str, root: str) -> Iterator[Dict[str, Any]]:
          cursor = data["pageInfo"]["endCursor"]


- def _normalize_issue(item: Dict[str, Any]) -> Dict[str, Any]:
-     field_mapping = {
-         "assignee": "assignee_id",
-         "creator": "creator_id",
-         "state": "state_id",
-         "cycle": "cycle_id",
-         "project": "project_id",
-     }
-     for key, value in field_mapping.items():
-         if item.get(key):
-             item[value] = item[key]["id"]
-             del item[key]
-         else:
-             item[value] = None
-             del item[key]
-     json_fields = [
-         "comments",
-         "subscribers",
-         "attachments",
-         "labels",
-         "subtasks",
-         "projects",
-         "memberships",
-         "members",
-     ]
-     for field in json_fields:
-         if item.get(field):
-             item[f"{field}"] = item[field].get("nodes", [])

-     return item

-
- def _normalize_team(item: Dict[str, Any]) -> Dict[str, Any]:
-     json_fields = ["memberships", "members", "projects"]
-     for field in json_fields:
-         if item.get(field):
-             item[f"{field}"] = item[field].get("nodes", [])
-     return item
+ def normalize_dictionaries(item: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Automatically normalize dictionary fields by detecting their structure:
+     - Convert nested objects with 'id' field to {field_name}_id
+     - Convert objects with 'nodes' field to arrays
+     """
+     normalized_item = item.copy()
+
+     for key, value in list(normalized_item.items()):
+         if isinstance(value, dict):
+             # If the dict has an 'id' field, replace with {key}_id
+             if 'id' in value:
+                 normalized_item[f"{key}_id"] = value['id']
+                 del normalized_item[key]
+             # If the dict has 'nodes' field, extract the nodes array
+             elif 'nodes' in value:
+                 normalized_item[key] = value['nodes']
+
+     return normalized_item
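The generic normalize_dictionaries replaces the two hand-written normalizers: any nested object carrying an id collapses into a {field}_id value, and any {"nodes": [...]} wrapper becomes a plain list. Applied to a made-up Linear issue payload (the helper is restated so the example runs on its own):

    from typing import Any, Dict

    # Restated from the hunk above.
    def normalize_dictionaries(item: Dict[str, Any]) -> Dict[str, Any]:
        normalized_item = item.copy()
        for key, value in list(normalized_item.items()):
            if isinstance(value, dict):
                if "id" in value:
                    normalized_item[f"{key}_id"] = value["id"]
                    del normalized_item[key]
                elif "nodes" in value:
                    normalized_item[key] = value["nodes"]
        return normalized_item

    issue = {
        "id": "ISS-1",
        "title": "Example issue",
        "assignee": {"id": "user-42"},           # becomes assignee_id
        "labels": {"nodes": [{"name": "bug"}]},  # unwrapped to a plain list
    }
    print(normalize_dictionaries(issue))
    # {'id': 'ISS-1', 'title': 'Example issue', 'labels': [{'name': 'bug'}], 'assignee_id': 'user-42'}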
@@ -106,6 +106,7 @@ def mongodb_collection(
      filter_: Optional[Dict[str, Any]] = None,
      projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
      pymongoarrow_schema: Optional[Any] = None,
+     custom_query: Optional[List[Dict[str, Any]]] = None,
  ) -> Any:
      """
      A DLT source which loads a collection from a mongo database using PyMongo.
@@ -132,6 +133,7 @@ def mongodb_collection(
              exclude (dict) - {"released": False, "runtime": False}
              Note: Can't mix include and exclude statements '{"title": True, "released": False}`
          pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+         custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

      Returns:
          Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -161,4 +163,5 @@
          filter_=filter_ or {},
          projection=projection,
          pymongoarrow_schema=pymongoarrow_schema,
+         custom_query=custom_query,
      )
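custom_query lets a caller replace the default find() with a MongoDB aggregation pipeline; the value is simply a list of stage dictionaries. A hypothetical example, with stage contents and field names made up for illustration:

    # Hypothetical pipeline that could be passed as custom_query.
    pipeline = [
        {"$match": {"status": "active", "amount": {"$gte": 100}}},
        {"$project": {"_id": 1, "status": 1, "amount": 1, "updated_at": 1}},
        {"$sort": {"updated_at": 1}},
    ]

    # It would then be forwarded via the new keyword argument, for example:
    # mongodb_collection(..., custom_query=pipeline)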