ingestr 0.14.4__py3-none-any.whl → 0.14.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
-version = "v0.14.4"
+version = "v0.14.6"
ingestr/src/factory.py CHANGED
@@ -52,6 +52,7 @@ from ingestr.src.sources import (
     GoogleAnalyticsSource,
     GoogleSheetsSource,
     GorgiasSource,
+    HttpSource,
     HubspotSource,
     InfluxDBSource,
     IntercomSource,
@@ -64,12 +65,14 @@ from ingestr.src.sources import (
     LinkedInAdsSource,
     LocalCsvSource,
     MixpanelSource,
+    MondaySource,
     MongoDbSource,
     NotionSource,
     PersonioSource,
     PhantombusterSource,
     PinterestSource,
     PipedriveSource,
+    PlusVibeAISource,
     QuickBooksSource,
     RevenueCatSource,
     S3Source,
@@ -155,6 +158,8 @@ class SourceDestinationFactory:
         "anthropic": AnthropicSource,
         "csv": LocalCsvSource,
         "docebo": DoceboSource,
+        "http": HttpSource,
+        "https": HttpSource,
         "mongodb": MongoDbSource,
         "mongodb+srv": MongoDbSource,
         "notion": NotionSource,
@@ -212,6 +217,8 @@ class SourceDestinationFactory:
         "clickup": ClickupSource,
         "influxdb": InfluxDBSource,
         "wise": WiseSource,
+        "plusvibeai": PlusVibeAISource,
+        "monday": MondaySource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
@@ -0,0 +1,35 @@
+"""HTTP source for reading CSV, JSON, and Parquet files from public URLs"""
+
+from typing import Any, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .readers import HttpReader
+
+
+@dlt.source
+def http_source(
+    url: str,
+    file_format: Optional[str] = None,
+    **kwargs: Any,
+) -> DltResource:
+    """Source for reading files from HTTP URLs.
+
+    Supports CSV, JSON, and Parquet file formats.
+
+    Args:
+        url (str): The HTTP(S) URL to the file
+        file_format (str, optional): File format ('csv', 'json', 'parquet').
+            If not provided, will be inferred from URL extension.
+        **kwargs: Additional arguments passed to the reader functions
+
+    Returns:
+        DltResource: A dlt resource that yields the file data
+    """
+    reader = HttpReader(url, file_format)
+
+    return dlt.resource(
+        reader.read_file(**kwargs),
+        name="http_data",
+    )
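
A minimal usage sketch for the new source, assuming it is importable as ingestr.src.http (the diff omits the new file's path, so that import is hypothetical):

import dlt

from ingestr.src.http import http_source  # hypothetical module path

pipeline = dlt.pipeline(destination="duckdb", dataset_name="http_demo")
# file_format is inferred from the .csv extension; pass it explicitly for
# extension-less URLs.
info = pipeline.run(http_source("https://example.com/data.csv"))
print(info)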
@@ -0,0 +1,114 @@
+"""Readers for HTTP file sources"""
+
+import io
+from typing import Any, Iterator, Optional
+from urllib.parse import urlparse
+
+import requests
+from dlt.sources import TDataItems
+
+
+class HttpReader:
+    """Reader for HTTP-based file sources"""
+
+    def __init__(self, url: str, file_format: Optional[str] = None):
+        self.url = url
+        self.file_format = file_format or self._infer_format(url)
+
+        if self.file_format not in ["csv", "json", "parquet"]:
+            raise ValueError(
+                f"Unsupported file format: {self.file_format}. "
+                "Supported formats: csv, json, parquet"
+            )
+
+    def _infer_format(self, url: str) -> str:
+        """Infer file format from URL extension"""
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+
+        if path.endswith(".csv"):
+            return "csv"
+        elif path.endswith(".json") or path.endswith(".jsonl"):
+            return "json"
+        elif path.endswith(".parquet"):
+            return "parquet"
+        else:
+            raise ValueError(
+                f"Cannot infer file format from URL: {url}. "
+                "Please specify file_format parameter."
+            )
+
+    def _download_file(self) -> bytes:
+        """Download file from URL"""
+        response = requests.get(self.url, stream=True, timeout=30)
+        response.raise_for_status()
+        return response.content
+
+    def read_file(self, **kwargs: Any) -> Iterator[TDataItems]:
+        """Read file and yield data in chunks"""
+        content = self._download_file()
+
+        if self.file_format == "csv":
+            yield from self._read_csv(content, **kwargs)
+        elif self.file_format == "json":
+            yield from self._read_json(content, **kwargs)
+        elif self.file_format == "parquet":
+            yield from self._read_parquet(content, **kwargs)
+
+    def _read_csv(
+        self, content: bytes, chunksize: int = 10000, **pandas_kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read CSV file with Pandas chunk by chunk"""
+        import pandas as pd  # type: ignore
+
+        kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
+
+        file_obj = io.BytesIO(content)
+        for df in pd.read_csv(file_obj, **kwargs):
+            yield df.to_dict(orient="records")
+
+    def _read_json(
+        self, content: bytes, chunksize: int = 1000, **kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read JSON or JSONL file"""
+        from dlt.common import json
+
+        file_obj = io.BytesIO(content)
+        text = file_obj.read().decode("utf-8")
+
+        # Try to detect if it's JSONL format (one JSON object per line)
+        lines = text.strip().split("\n")
+
+        if len(lines) > 1:
+            # Likely JSONL format
+            lines_chunk = []
+            for line in lines:
+                if line.strip():
+                    lines_chunk.append(json.loads(line))
+                    if len(lines_chunk) >= chunksize:
+                        yield lines_chunk
+                        lines_chunk = []
+            if lines_chunk:
+                yield lines_chunk
+        else:
+            # Single JSON object or array
+            data = json.loads(text)
+            if isinstance(data, list):
+                # Chunk the list
+                for i in range(0, len(data), chunksize):
+                    yield data[i : i + chunksize]
+            else:
+                # Single object
+                yield [data]
+
+    def _read_parquet(
+        self, content: bytes, chunksize: int = 10000, **kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read Parquet file"""
+        from pyarrow import parquet as pq  # type: ignore
+
+        file_obj = io.BytesIO(content)
+        parquet_file = pq.ParquetFile(file_obj)
+
+        for batch in parquet_file.iter_batches(batch_size=chunksize):
+            yield batch.to_pylist()
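
A minimal sketch of the JSONL chunking path, with the download step patched out so it runs offline; the import path is hypothetical, since the diff omits the new file's location:

from unittest import mock

from ingestr.src.http.readers import HttpReader  # hypothetical module path

reader = HttpReader("https://example.com/events.jsonl")  # inferred as "json"
payload = b'{"id": 1}\n{"id": 2}\n{"id": 3}\n'

# _download_file is replaced so no network call is made.
with mock.patch.object(HttpReader, "_download_file", return_value=payload):
    print(list(reader.read_file(chunksize=2)))
# -> [[{'id': 1}, {'id': 2}], [{'id': 3}]]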
@@ -37,6 +37,7 @@ def jira_source() -> Any:
         resolutions,
         project_versions,
         project_components,
+        events,
     ]


@@ -65,7 +66,11 @@ def projects(
     yield from client.get_projects(expand=expand, recent=recent)


-@dlt.resource(write_disposition="merge", primary_key="id")
+@dlt.resource(
+    write_disposition="merge",
+    primary_key="id",
+    max_table_nesting=2,
+)
 def issues(
     base_url: str = dlt.secrets.value,
     email: str = dlt.secrets.value,
@@ -312,3 +317,24 @@ def project_components(
         return []

     return list(client.get_project_components(project_key))
+
+
+@dlt.resource(write_disposition="replace")
+def events(
+    base_url: str = dlt.secrets.value,
+    email: str = dlt.secrets.value,
+    api_token: str = dlt.secrets.value,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all event types from Jira (e.g., Issue Created, Issue Updated, etc.).
+
+    Args:
+        base_url (str): Jira instance URL
+        email (str): User email for authentication
+        api_token (str): API token for authentication
+
+    Yields:
+        dict: The event data.
+    """
+    client = get_client(base_url, email, api_token)
+    yield from client.get_events()
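
A minimal sketch of what the new resource does under the hood, assuming get_client is importable from the client module shown below (its path is omitted in this diff) and that event objects carry id and name fields, per Jira's documented events endpoint:

from ingestr.src.jira.helpers import get_client  # hypothetical module path

client = get_client(
    "https://your-domain.atlassian.net", "user@example.com", "api-token"
)
for event in client.get_events():
    print(event.get("id"), event.get("name"))  # field names are an assumption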
@@ -98,8 +98,6 @@ class JiraClient:

         for attempt in range(max_retries + 1):
             try:
-                logger.debug(f"Making request to {url} (attempt {attempt + 1})")
-
                 response = requests.request(
                     method=method,
                     url=url,
@@ -214,10 +212,6 @@ class JiraClient:
         consecutive_empty_pages = 0
         max_empty_pages = 3

-        logger.info(
-            f"Starting paginated request to {endpoint} with page_size={page_size}"
-        )
-
         while True:
             try:
                 response = self._make_request(endpoint, params)
@@ -238,7 +232,6 @@ class JiraClient:
                     is_last = True
                 else:
                     # Single item response
-                    logger.debug(f"Received single item response from {endpoint}")
                     yield response
                     break

@@ -253,27 +246,18 @@ class JiraClient:
            else:
                consecutive_empty_pages = 0

-            logger.debug(
-                f"Retrieved {len(items)} items from {endpoint} (page {params['startAt'] // page_size + 1})"
-            )
-
            for item in items:
                if max_results and total_returned >= max_results:
-                    logger.info(f"Reached max_results limit of {max_results}")
                    return
                yield item
                total_returned += 1

            # Check if we've reached the end
            if is_last or len(items) < page_size:
-                logger.debug(f"Reached end of pagination for {endpoint}")
                break

            # Check if we've got all available items
            if total and total_returned >= total:
-                logger.debug(
-                    f"Retrieved all {total} available items from {endpoint}"
-                )
                break

            # Move to next page
@@ -295,10 +279,6 @@ class JiraClient:
                )
                raise JiraAPIError(f"Pagination failed: {str(e)}")

-        logger.info(
-            f"Completed pagination for {endpoint}, returned {total_returned} items"
-        )
-
     def search_issues(
         self,
         jql: str,
@@ -327,7 +307,7 @@ class JiraClient:
            params["expand"] = expand

        yield from self.get_paginated(
-            "search", params=params, page_size=page_size, max_results=max_results
+            "search/jql", params=params, page_size=page_size, max_results=max_results
        )

@@ -433,6 +413,13 @@ class JiraClient:
        """
        yield from self.get_paginated(f"project/{project_key}/component")

+    def get_events(self) -> Iterator[Dict[str, Any]]:
+        """Get all events (issue events like created, updated, etc.)."""
+        response = self._make_request("events")
+        if isinstance(response, list):
+            for event in response:
+                yield event
+

 def get_client(
     base_url: str, email: str, api_token: str, timeout: int = REQUEST_TIMEOUT
@@ -0,0 +1,246 @@
+"""
+Monday.com source for data extraction via GraphQL API.
+
+This source provides access to Monday.com app installation data.
+"""
+
+from typing import Any, Iterable, Iterator, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .helpers import MondayClient, normalize_dict
+
+
+@dlt.source(max_table_nesting=0, name="monday_source")
+def monday_source(
+    api_token: str,
+    params: list[str],
+    start_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+) -> Iterable[DltResource]:
+    """
+    Monday.com data source.
+
+    Args:
+        api_token: Monday.com API token for authentication
+        params: Table-specific parameters in format [table_type, ...params]
+        start_date: Optional start date for date-filtered queries (YYYY-MM-DD)
+        end_date: Optional end date for date-filtered queries (YYYY-MM-DD)
+
+    Yields:
+        DltResource: Data resource for the requested table
+    """
+    monday_client = MondayClient(api_token)
+
+    @dlt.resource(
+        name="account",
+        write_disposition="replace",
+    )
+    def fetch_account() -> Iterator[dict[str, Any]]:
+        """
+        Fetch account information from Monday.com.
+
+        Table format: account (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Account table must be in the format `account`")
+
+        yield normalize_dict(monday_client.get_account())
+
+    @dlt.resource(
+        name="account_roles",
+        write_disposition="replace",
+    )
+    def fetch_account_roles() -> Iterator[dict[str, Any]]:
+        """
+        Fetch account roles from Monday.com.
+
+        Table format: account_roles (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Account roles table must be in the format `account_roles`"
+            )
+
+        yield from monday_client.get_account_roles()
+
+    @dlt.resource(
+        name="users",
+        write_disposition="replace",
+    )
+    def fetch_users() -> Iterator[dict[str, Any]]:
+        """
+        Fetch users from Monday.com.
+
+        Table format: users (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Users table must be in the format `users`")
+
+        yield from monday_client.get_users()
+
+    @dlt.resource(
+        name="boards",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def fetch_boards(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updated_at", initial_value=start_date
+        ),
+    ) -> Iterator[dict[str, Any]]:
+        """
+        Fetch boards from Monday.com.
+
+        Table format: boards (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Boards table must be in the format `boards`")
+
+        yield from monday_client.get_boards()
+
+    @dlt.resource(
+        name="workspaces",
+        write_disposition="replace",
+    )
+    def fetch_workspaces() -> Iterator[dict[str, Any]]:
+        """
+        Fetch workspaces from Monday.com.
+
+        Table format: workspaces (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Workspaces table must be in the format `workspaces`")
+
+        yield from monday_client.get_workspaces()
+
+    @dlt.resource(
+        name="webhooks",
+        write_disposition="replace",
+    )
+    def fetch_webhooks() -> Iterator[dict[str, Any]]:
+        """
+        Fetch webhooks from Monday.com.
+
+        Table format: webhooks (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Webhooks table must be in the format `webhooks`")
+
+        yield from monday_client.get_webhooks()
+
+    @dlt.resource(
+        name="updates",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def fetch_updates(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updated_at", initial_value=start_date
+        ),
+    ) -> Iterator[dict[str, Any]]:
+        """
+        Fetch updates from Monday.com.
+
+        Table format: updates (no parameters needed)
+        Requires start_date and end_date parameters
+        """
+        if len(params) != 0:
+            raise ValueError("Updates table must be in the format `updates`")
+
+        yield from monday_client.get_updates(start_date=start_date, end_date=end_date)
+
+    @dlt.resource(
+        name="teams",
+        write_disposition="replace",
+    )
+    def fetch_teams() -> Iterator[dict[str, Any]]:
+        """
+        Fetch teams from Monday.com.
+
+        Table format: teams (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Teams table must be in the format `teams`")
+
+        yield from monday_client.get_teams()
+
+    @dlt.resource(
+        name="tags",
+        write_disposition="replace",
+    )
+    def fetch_tags() -> Iterator[dict[str, Any]]:
+        """
+        Fetch tags from Monday.com.
+
+        Table format: tags (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Tags table must be in the format `tags`")
+
+        yield from monday_client.get_tags()
+
+    @dlt.resource(
+        name="custom_activities",
+        write_disposition="replace",
+    )
+    def fetch_custom_activities() -> Iterator[dict[str, Any]]:
+        """
+        Fetch custom activities from Monday.com.
+
+        Table format: custom_activities (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Custom activities table must be in the format `custom_activities`"
+            )
+
+        yield from monday_client.get_custom_activities()
+
+    @dlt.resource(
+        name="board_columns",
+        write_disposition="replace",
+    )
+    def fetch_board_columns() -> Iterator[dict[str, Any]]:
+        """
+        Fetch board columns from Monday.com.
+
+        Table format: board_columns (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Board columns table must be in the format `board_columns`"
+            )
+
+        yield from monday_client.get_board_columns()
+
+    @dlt.resource(
+        name="board_views",
+        write_disposition="replace",
+    )
+    def fetch_board_views() -> Iterator[dict[str, Any]]:
+        """
+        Fetch board views from Monday.com.
+
+        Table format: board_views (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Board views table must be in the format `board_views`")
+
+        yield from monday_client.get_board_views()
+
+    return (
+        fetch_account,
+        fetch_account_roles,
+        fetch_users,
+        fetch_boards,
+        fetch_workspaces,
+        fetch_webhooks,
+        fetch_updates,
+        fetch_teams,
+        fetch_tags,
+        fetch_custom_activities,
+        fetch_board_columns,
+        fetch_board_views,
+    )
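
A minimal sketch of running the new source with dlt, assuming it is importable as ingestr.src.monday (the diff omits file paths, so that import is hypothetical):

import dlt

from ingestr.src.monday import monday_source  # hypothetical module path

pipeline = dlt.pipeline(destination="duckdb", dataset_name="monday_demo")
# params=[] selects the zero-parameter tables; each resource validates this.
source = monday_source(api_token="your-monday-token", params=[])
pipeline.run(source.with_resources("users", "boards"))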