ingestr 0.14.4__py3-none-any.whl → 0.14.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
-version = "v0.14.4"
+version = "v0.14.6"
ingestr/src/factory.py CHANGED
@@ -52,6 +52,7 @@ from ingestr.src.sources import (
     GoogleAnalyticsSource,
     GoogleSheetsSource,
     GorgiasSource,
+    HttpSource,
     HubspotSource,
     InfluxDBSource,
     IntercomSource,
@@ -64,12 +65,14 @@ from ingestr.src.sources import (
     LinkedInAdsSource,
     LocalCsvSource,
     MixpanelSource,
+    MondaySource,
     MongoDbSource,
     NotionSource,
     PersonioSource,
     PhantombusterSource,
     PinterestSource,
     PipedriveSource,
+    PlusVibeAISource,
     QuickBooksSource,
     RevenueCatSource,
     S3Source,
@@ -155,6 +158,8 @@ class SourceDestinationFactory:
         "anthropic": AnthropicSource,
         "csv": LocalCsvSource,
         "docebo": DoceboSource,
+        "http": HttpSource,
+        "https": HttpSource,
         "mongodb": MongoDbSource,
         "mongodb+srv": MongoDbSource,
         "notion": NotionSource,
@@ -212,6 +217,8 @@ class SourceDestinationFactory:
         "clickup": ClickupSource,
         "influxdb": InfluxDBSource,
         "wise": WiseSource,
+        "plusvibeai": PlusVibeAISource,
+        "monday": MondaySource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
@@ -0,0 +1,35 @@
+"""HTTP source for reading CSV, JSON, and Parquet files from public URLs"""
+
+from typing import Any, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .readers import HttpReader
+
+
+@dlt.source
+def http_source(
+    url: str,
+    file_format: Optional[str] = None,
+    **kwargs: Any,
+) -> DltResource:
+    """Source for reading files from HTTP URLs.
+
+    Supports CSV, JSON, and Parquet file formats.
+
+    Args:
+        url (str): The HTTP(S) URL to the file
+        file_format (str, optional): File format ('csv', 'json', 'parquet').
+            If not provided, will be inferred from URL extension.
+        **kwargs: Additional arguments passed to the reader functions
+
+    Returns:
+        DltResource: A dlt resource that yields the file data
+    """
+    reader = HttpReader(url, file_format)
+
+    return dlt.resource(
+        reader.read_file(**kwargs),
+        name="http_data",
+    )
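
A minimal usage sketch for the new source, assuming it is importable as ingestr.src.http (the diff omits the new file's path, so that import is hypothetical):

import dlt

from ingestr.src.http import http_source  # hypothetical module path

pipeline = dlt.pipeline(destination="duckdb", dataset_name="http_demo")
# file_format is inferred from the .csv extension; pass it explicitly for
# extension-less URLs.
info = pipeline.run(http_source("https://example.com/data.csv"))
print(info)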
@@ -0,0 +1,114 @@
+"""Readers for HTTP file sources"""
+
+import io
+from typing import Any, Iterator, Optional
+from urllib.parse import urlparse
+
+import requests
+from dlt.sources import TDataItems
+
+
+class HttpReader:
+    """Reader for HTTP-based file sources"""
+
+    def __init__(self, url: str, file_format: Optional[str] = None):
+        self.url = url
+        self.file_format = file_format or self._infer_format(url)
+
+        if self.file_format not in ["csv", "json", "parquet"]:
+            raise ValueError(
+                f"Unsupported file format: {self.file_format}. "
+                "Supported formats: csv, json, parquet"
+            )
+
+    def _infer_format(self, url: str) -> str:
+        """Infer file format from URL extension"""
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+
+        if path.endswith(".csv"):
+            return "csv"
+        elif path.endswith(".json") or path.endswith(".jsonl"):
+            return "json"
+        elif path.endswith(".parquet"):
+            return "parquet"
+        else:
+            raise ValueError(
+                f"Cannot infer file format from URL: {url}. "
+                "Please specify file_format parameter."
+            )
+
+    def _download_file(self) -> bytes:
+        """Download file from URL"""
+        response = requests.get(self.url, stream=True, timeout=30)
+        response.raise_for_status()
+        return response.content
+
+    def read_file(self, **kwargs: Any) -> Iterator[TDataItems]:
+        """Read file and yield data in chunks"""
+        content = self._download_file()
+
+        if self.file_format == "csv":
+            yield from self._read_csv(content, **kwargs)
+        elif self.file_format == "json":
+            yield from self._read_json(content, **kwargs)
+        elif self.file_format == "parquet":
+            yield from self._read_parquet(content, **kwargs)
+
+    def _read_csv(
+        self, content: bytes, chunksize: int = 10000, **pandas_kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read CSV file with Pandas chunk by chunk"""
+        import pandas as pd  # type: ignore
+
+        kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
+
+        file_obj = io.BytesIO(content)
+        for df in pd.read_csv(file_obj, **kwargs):
+            yield df.to_dict(orient="records")
+
+    def _read_json(
+        self, content: bytes, chunksize: int = 1000, **kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read JSON or JSONL file"""
+        from dlt.common import json
+
+        file_obj = io.BytesIO(content)
+        text = file_obj.read().decode("utf-8")
+
+        # Try to detect if it's JSONL format (one JSON object per line)
+        lines = text.strip().split("\n")
+
+        if len(lines) > 1:
+            # Likely JSONL format
+            lines_chunk = []
+            for line in lines:
+                if line.strip():
+                    lines_chunk.append(json.loads(line))
+                    if len(lines_chunk) >= chunksize:
+                        yield lines_chunk
+                        lines_chunk = []
+            if lines_chunk:
+                yield lines_chunk
+        else:
+            # Single JSON object or array
+            data = json.loads(text)
+            if isinstance(data, list):
+                # Chunk the list
+                for i in range(0, len(data), chunksize):
+                    yield data[i : i + chunksize]
+            else:
+                # Single object
+                yield [data]
+
+    def _read_parquet(
+        self, content: bytes, chunksize: int = 10000, **kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read Parquet file"""
+        from pyarrow import parquet as pq  # type: ignore
+
+        file_obj = io.BytesIO(content)
+        parquet_file = pq.ParquetFile(file_obj)
+
+        for batch in parquet_file.iter_batches(batch_size=chunksize):
+            yield batch.to_pylist()
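
A minimal sketch of the JSONL chunking path, with the download step patched out so it runs offline; the import path is hypothetical, since the diff omits the new file's location:

from unittest import mock

from ingestr.src.http.readers import HttpReader  # hypothetical module path

reader = HttpReader("https://example.com/events.jsonl")  # inferred as "json"
payload = b'{"id": 1}\n{"id": 2}\n{"id": 3}\n'

# _download_file is replaced so no network call is made.
with mock.patch.object(HttpReader, "_download_file", return_value=payload):
    print(list(reader.read_file(chunksize=2)))
# -> [[{'id': 1}, {'id': 2}], [{'id': 3}]]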
@@ -37,6 +37,7 @@ def jira_source() -> Any:
         resolutions,
         project_versions,
         project_components,
+        events,
     ]


@@ -65,7 +66,11 @@ def projects(
     yield from client.get_projects(expand=expand, recent=recent)


-@dlt.resource(write_disposition="merge", primary_key="id")
+@dlt.resource(
+    write_disposition="merge",
+    primary_key="id",
+    max_table_nesting=2,
+)
 def issues(
     base_url: str = dlt.secrets.value,
     email: str = dlt.secrets.value,
@@ -312,3 +317,24 @@ def project_components(
         return []

     return list(client.get_project_components(project_key))
+
+
+@dlt.resource(write_disposition="replace")
+def events(
+    base_url: str = dlt.secrets.value,
+    email: str = dlt.secrets.value,
+    api_token: str = dlt.secrets.value,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all event types from Jira (e.g., Issue Created, Issue Updated, etc.).
+
+    Args:
+        base_url (str): Jira instance URL
+        email (str): User email for authentication
+        api_token (str): API token for authentication
+
+    Yields:
+        dict: The event data.
+    """
+    client = get_client(base_url, email, api_token)
+    yield from client.get_events()
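
A minimal sketch of what the new resource does under the hood, assuming get_client is importable from the client module shown below (its path is omitted in this diff) and that event objects carry id and name fields, per Jira's documented events endpoint:

from ingestr.src.jira.helpers import get_client  # hypothetical module path

client = get_client(
    "https://your-domain.atlassian.net", "user@example.com", "api-token"
)
for event in client.get_events():
    print(event.get("id"), event.get("name"))  # field names are an assumption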
@@ -98,8 +98,6 @@ class JiraClient:

         for attempt in range(max_retries + 1):
             try:
-                logger.debug(f"Making request to {url} (attempt {attempt + 1})")
-
                 response = requests.request(
                     method=method,
                     url=url,
@@ -214,10 +212,6 @@ class JiraClient:
         consecutive_empty_pages = 0
         max_empty_pages = 3

-        logger.info(
-            f"Starting paginated request to {endpoint} with page_size={page_size}"
-        )
-
         while True:
             try:
                 response = self._make_request(endpoint, params)
@@ -238,7 +232,6 @@ class JiraClient:
                     is_last = True
                 else:
                     # Single item response
-                    logger.debug(f"Received single item response from {endpoint}")
                     yield response
                     break

@@ -253,27 +246,18 @@ class JiraClient:
            else:
                consecutive_empty_pages = 0

-            logger.debug(
-                f"Retrieved {len(items)} items from {endpoint} (page {params['startAt'] // page_size + 1})"
-            )
-
            for item in items:
                if max_results and total_returned >= max_results:
-                    logger.info(f"Reached max_results limit of {max_results}")
                    return
                yield item
                total_returned += 1

            # Check if we've reached the end
            if is_last or len(items) < page_size:
-                logger.debug(f"Reached end of pagination for {endpoint}")
                break

            # Check if we've got all available items
            if total and total_returned >= total:
-                logger.debug(
-                    f"Retrieved all {total} available items from {endpoint}"
-                )
                break

            # Move to next page
@@ -295,10 +279,6 @@ class JiraClient:
                )
                raise JiraAPIError(f"Pagination failed: {str(e)}")

-        logger.info(
-            f"Completed pagination for {endpoint}, returned {total_returned} items"
-        )
-
     def search_issues(
         self,
         jql: str,
@@ -327,7 +307,7 @@ class JiraClient:
            params["expand"] = expand

        yield from self.get_paginated(
-            "search", params=params, page_size=page_size, max_results=max_results
+            "search/jql", params=params, page_size=page_size, max_results=max_results
        )

@@ -433,6 +413,13 @@ class JiraClient:
        """
        yield from self.get_paginated(f"project/{project_key}/component")

+    def get_events(self) -> Iterator[Dict[str, Any]]:
+        """Get all events (issue events like created, updated, etc.)."""
+        response = self._make_request("events")
+        if isinstance(response, list):
+            for event in response:
+                yield event
+

 def get_client(
     base_url: str, email: str, api_token: str, timeout: int = REQUEST_TIMEOUT
@@ -0,0 +1,246 @@
+"""
+Monday.com source for data extraction via GraphQL API.
+
+This source provides access to Monday.com app installation data.
+"""
+
+from typing import Any, Iterable, Iterator, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .helpers import MondayClient, normalize_dict
+
+
+@dlt.source(max_table_nesting=0, name="monday_source")
+def monday_source(
+    api_token: str,
+    params: list[str],
+    start_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+) -> Iterable[DltResource]:
+    """
+    Monday.com data source.
+
+    Args:
+        api_token: Monday.com API token for authentication
+        params: Table-specific parameters in format [table_type, ...params]
+        start_date: Optional start date for date-filtered queries (YYYY-MM-DD)
+        end_date: Optional end date for date-filtered queries (YYYY-MM-DD)
+
+    Yields:
+        DltResource: Data resource for the requested table
+    """
+    monday_client = MondayClient(api_token)
+
+    @dlt.resource(
+        name="account",
+        write_disposition="replace",
+    )
+    def fetch_account() -> Iterator[dict[str, Any]]:
+        """
+        Fetch account information from Monday.com.
+
+        Table format: account (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Account table must be in the format `account`")
+
+        yield normalize_dict(monday_client.get_account())
+
+    @dlt.resource(
+        name="account_roles",
+        write_disposition="replace",
+    )
+    def fetch_account_roles() -> Iterator[dict[str, Any]]:
+        """
+        Fetch account roles from Monday.com.
+
+        Table format: account_roles (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Account roles table must be in the format `account_roles`"
+            )
+
+        yield from monday_client.get_account_roles()
+
+    @dlt.resource(
+        name="users",
+        write_disposition="replace",
+    )
+    def fetch_users() -> Iterator[dict[str, Any]]:
+        """
+        Fetch users from Monday.com.
+
+        Table format: users (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Users table must be in the format `users`")
+
+        yield from monday_client.get_users()
+
+    @dlt.resource(
+        name="boards",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def fetch_boards(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updated_at", initial_value=start_date
+        ),
+    ) -> Iterator[dict[str, Any]]:
+        """
+        Fetch boards from Monday.com.
+
+        Table format: boards (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Boards table must be in the format `boards`")
+
+        yield from monday_client.get_boards()
+
+    @dlt.resource(
+        name="workspaces",
+        write_disposition="replace",
+    )
+    def fetch_workspaces() -> Iterator[dict[str, Any]]:
+        """
+        Fetch workspaces from Monday.com.
+
+        Table format: workspaces (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Workspaces table must be in the format `workspaces`")
+
+        yield from monday_client.get_workspaces()
+
+    @dlt.resource(
+        name="webhooks",
+        write_disposition="replace",
+    )
+    def fetch_webhooks() -> Iterator[dict[str, Any]]:
+        """
+        Fetch webhooks from Monday.com.
+
+        Table format: webhooks (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Webhooks table must be in the format `webhooks`")
+
+        yield from monday_client.get_webhooks()
+
+    @dlt.resource(
+        name="updates",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def fetch_updates(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updated_at", initial_value=start_date
+        ),
+    ) -> Iterator[dict[str, Any]]:
+        """
+        Fetch updates from Monday.com.
+
+        Table format: updates (no parameters needed)
+        Requires start_date and end_date parameters
+        """
+        if len(params) != 0:
+            raise ValueError("Updates table must be in the format `updates`")
+
+        yield from monday_client.get_updates(start_date=start_date, end_date=end_date)
+
+    @dlt.resource(
+        name="teams",
+        write_disposition="replace",
+    )
+    def fetch_teams() -> Iterator[dict[str, Any]]:
+        """
+        Fetch teams from Monday.com.
+
+        Table format: teams (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Teams table must be in the format `teams`")
+
+        yield from monday_client.get_teams()
+
+    @dlt.resource(
+        name="tags",
+        write_disposition="replace",
+    )
+    def fetch_tags() -> Iterator[dict[str, Any]]:
+        """
+        Fetch tags from Monday.com.
+
+        Table format: tags (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Tags table must be in the format `tags`")
+
+        yield from monday_client.get_tags()
+
+    @dlt.resource(
+        name="custom_activities",
+        write_disposition="replace",
+    )
+    def fetch_custom_activities() -> Iterator[dict[str, Any]]:
+        """
+        Fetch custom activities from Monday.com.
+
+        Table format: custom_activities (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Custom activities table must be in the format `custom_activities`"
+            )
+
+        yield from monday_client.get_custom_activities()
+
+    @dlt.resource(
+        name="board_columns",
+        write_disposition="replace",
+    )
+    def fetch_board_columns() -> Iterator[dict[str, Any]]:
+        """
+        Fetch board columns from Monday.com.
+
+        Table format: board_columns (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Board columns table must be in the format `board_columns`"
+            )
+
+        yield from monday_client.get_board_columns()
+
+    @dlt.resource(
+        name="board_views",
+        write_disposition="replace",
+    )
+    def fetch_board_views() -> Iterator[dict[str, Any]]:
+        """
+        Fetch board views from Monday.com.
+
+        Table format: board_views (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Board views table must be in the format `board_views`")
+
+        yield from monday_client.get_board_views()
+
+    return (
+        fetch_account,
+        fetch_account_roles,
+        fetch_users,
+        fetch_boards,
+        fetch_workspaces,
+        fetch_webhooks,
+        fetch_updates,
+        fetch_teams,
+        fetch_tags,
+        fetch_custom_activities,
+        fetch_board_columns,
+        fetch_board_views,
+    )
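
A minimal sketch of running the new source with dlt, assuming it is importable as ingestr.src.monday (the diff omits file paths, so that import is hypothetical):

import dlt

from ingestr.src.monday import monday_source  # hypothetical module path

pipeline = dlt.pipeline(destination="duckdb", dataset_name="monday_demo")
# params=[] selects the zero-parameter tables; each resource validates this.
source = monday_source(api_token="your-monday-token", params=[])
pipeline.run(source.with_resources("users", "boards"))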