ingestr 0.14.5__py3-none-any.whl → 0.14.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
-version = "v0.14.5"
+version = "v0.14.6"
ingestr/src/factory.py CHANGED
@@ -52,6 +52,7 @@ from ingestr.src.sources import (
     GoogleAnalyticsSource,
     GoogleSheetsSource,
     GorgiasSource,
+    HttpSource,
     HubspotSource,
     InfluxDBSource,
     IntercomSource,
@@ -64,6 +65,7 @@ from ingestr.src.sources import (
     LinkedInAdsSource,
     LocalCsvSource,
     MixpanelSource,
+    MondaySource,
     MongoDbSource,
     NotionSource,
     PersonioSource,
@@ -156,6 +158,8 @@ class SourceDestinationFactory:
         "anthropic": AnthropicSource,
         "csv": LocalCsvSource,
         "docebo": DoceboSource,
+        "http": HttpSource,
+        "https": HttpSource,
         "mongodb": MongoDbSource,
         "mongodb+srv": MongoDbSource,
         "notion": NotionSource,
@@ -214,6 +218,7 @@ class SourceDestinationFactory:
         "influxdb": InfluxDBSource,
         "wise": WiseSource,
         "plusvibeai": PlusVibeAISource,
+        "monday": MondaySource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
ingestr/src/http/__init__.py ADDED
@@ -0,0 +1,35 @@
+"""HTTP source for reading CSV, JSON, and Parquet files from public URLs"""
+
+from typing import Any, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .readers import HttpReader
+
+
+@dlt.source
+def http_source(
+    url: str,
+    file_format: Optional[str] = None,
+    **kwargs: Any,
+) -> DltResource:
+    """Source for reading files from HTTP URLs.
+
+    Supports CSV, JSON, and Parquet file formats.
+
+    Args:
+        url (str): The HTTP(S) URL to the file
+        file_format (str, optional): File format ('csv', 'json', 'parquet').
+            If not provided, will be inferred from URL extension.
+        **kwargs: Additional arguments passed to the reader functions
+
+    Returns:
+        DltResource: A dlt resource that yields the file data
+    """
+    reader = HttpReader(url, file_format)
+
+    return dlt.resource(
+        reader.read_file(**kwargs),
+        name="http_data",
+    )
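
A minimal sketch of running the new source through a dlt pipeline (the pipeline name and duckdb destination are placeholders, not part of this release):

import dlt

from ingestr.src.http import http_source

# Placeholder pipeline; the resource produced by http_source is named "http_data".
pipeline = dlt.pipeline(pipeline_name="http_demo", destination="duckdb")
load_info = pipeline.run(http_source(url="https://example.com/data.csv"))
print(load_info)
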
ingestr/src/http/readers.py ADDED
@@ -0,0 +1,114 @@
+"""Readers for HTTP file sources"""
+
+import io
+from typing import Any, Iterator, Optional
+from urllib.parse import urlparse
+
+import requests
+from dlt.sources import TDataItems
+
+
+class HttpReader:
+    """Reader for HTTP-based file sources"""
+
+    def __init__(self, url: str, file_format: Optional[str] = None):
+        self.url = url
+        self.file_format = file_format or self._infer_format(url)
+
+        if self.file_format not in ["csv", "json", "parquet"]:
+            raise ValueError(
+                f"Unsupported file format: {self.file_format}. "
+                "Supported formats: csv, json, parquet"
+            )
+
+    def _infer_format(self, url: str) -> str:
+        """Infer file format from URL extension"""
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+
+        if path.endswith(".csv"):
+            return "csv"
+        elif path.endswith(".json") or path.endswith(".jsonl"):
+            return "json"
+        elif path.endswith(".parquet"):
+            return "parquet"
+        else:
+            raise ValueError(
+                f"Cannot infer file format from URL: {url}. "
+                "Please specify file_format parameter."
+            )
+
+    def _download_file(self) -> bytes:
+        """Download file from URL"""
+        response = requests.get(self.url, stream=True, timeout=30)
+        response.raise_for_status()
+        return response.content
+
+    def read_file(self, **kwargs: Any) -> Iterator[TDataItems]:
+        """Read file and yield data in chunks"""
+        content = self._download_file()
+
+        if self.file_format == "csv":
+            yield from self._read_csv(content, **kwargs)
+        elif self.file_format == "json":
+            yield from self._read_json(content, **kwargs)
+        elif self.file_format == "parquet":
+            yield from self._read_parquet(content, **kwargs)
+
+    def _read_csv(
+        self, content: bytes, chunksize: int = 10000, **pandas_kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read CSV file with Pandas chunk by chunk"""
+        import pandas as pd  # type: ignore
+
+        kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
+
+        file_obj = io.BytesIO(content)
+        for df in pd.read_csv(file_obj, **kwargs):
+            yield df.to_dict(orient="records")
+
+    def _read_json(
+        self, content: bytes, chunksize: int = 1000, **kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read JSON or JSONL file"""
+        from dlt.common import json
+
+        file_obj = io.BytesIO(content)
+        text = file_obj.read().decode("utf-8")
+
+        # Try to detect if it's JSONL format (one JSON object per line)
+        lines = text.strip().split("\n")
+
+        if len(lines) > 1:
+            # Likely JSONL format
+            lines_chunk = []
+            for line in lines:
+                if line.strip():
+                    lines_chunk.append(json.loads(line))
+                if len(lines_chunk) >= chunksize:
+                    yield lines_chunk
+                    lines_chunk = []
+            if lines_chunk:
+                yield lines_chunk
+        else:
+            # Single JSON object or array
+            data = json.loads(text)
+            if isinstance(data, list):
+                # Chunk the list
+                for i in range(0, len(data), chunksize):
+                    yield data[i : i + chunksize]
+            else:
+                # Single object
+                yield [data]
+
+    def _read_parquet(
+        self, content: bytes, chunksize: int = 10000, **kwargs: Any
+    ) -> Iterator[TDataItems]:
+        """Read Parquet file"""
+        from pyarrow import parquet as pq  # type: ignore
+
+        file_obj = io.BytesIO(content)
+        parquet_file = pq.ParquetFile(file_obj)
+
+        for batch in parquet_file.iter_batches(batch_size=chunksize):
+            yield batch.to_pylist()
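
A short sketch of using HttpReader directly; the URL is a placeholder, and the format is inferred from the .csv extension:

from ingestr.src.http.readers import HttpReader

reader = HttpReader("https://example.com/data.csv")
print(reader.file_format)  # -> "csv", inferred by _infer_format

# read_file() downloads the file once and yields lists of row dicts.
for chunk in reader.read_file(chunksize=500):
    print(len(chunk))  # at most 500 rows per chunk
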
ingestr/src/monday/__init__.py ADDED
@@ -0,0 +1,246 @@
+"""
+Monday.com source for data extraction via GraphQL API.
+
+This source provides access to Monday.com app installation data.
+"""
+
+from typing import Any, Iterable, Iterator, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .helpers import MondayClient, normalize_dict
+
+
+@dlt.source(max_table_nesting=0, name="monday_source")
+def monday_source(
+    api_token: str,
+    params: list[str],
+    start_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+) -> Iterable[DltResource]:
+    """
+    Monday.com data source.
+
+    Args:
+        api_token: Monday.com API token for authentication
+        params: Table-specific parameters in format [table_type, ...params]
+        start_date: Optional start date for date-filtered queries (YYYY-MM-DD)
+        end_date: Optional end date for date-filtered queries (YYYY-MM-DD)
+
+    Yields:
+        DltResource: Data resource for the requested table
+    """
+    monday_client = MondayClient(api_token)
+
+    @dlt.resource(
+        name="account",
+        write_disposition="replace",
+    )
+    def fetch_account() -> Iterator[dict[str, Any]]:
+        """
+        Fetch account information from Monday.com.
+
+        Table format: account (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Account table must be in the format `account`")
+
+        yield normalize_dict(monday_client.get_account())
+
+    @dlt.resource(
+        name="account_roles",
+        write_disposition="replace",
+    )
+    def fetch_account_roles() -> Iterator[dict[str, Any]]:
+        """
+        Fetch account roles from Monday.com.
+
+        Table format: account_roles (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Account roles table must be in the format `account_roles`"
+            )
+
+        yield from monday_client.get_account_roles()
+
+    @dlt.resource(
+        name="users",
+        write_disposition="replace",
+    )
+    def fetch_users() -> Iterator[dict[str, Any]]:
+        """
+        Fetch users from Monday.com.
+
+        Table format: users (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Users table must be in the format `users`")
+
+        yield from monday_client.get_users()
+
+    @dlt.resource(
+        name="boards",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def fetch_boards(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updated_at", initial_value=start_date
+        ),
+    ) -> Iterator[dict[str, Any]]:
+        """
+        Fetch boards from Monday.com.
+
+        Table format: boards (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Boards table must be in the format `boards`")
+
+        yield from monday_client.get_boards()
+
+    @dlt.resource(
+        name="workspaces",
+        write_disposition="replace",
+    )
+    def fetch_workspaces() -> Iterator[dict[str, Any]]:
+        """
+        Fetch workspaces from Monday.com.
+
+        Table format: workspaces (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Workspaces table must be in the format `workspaces`")
+
+        yield from monday_client.get_workspaces()
+
+    @dlt.resource(
+        name="webhooks",
+        write_disposition="replace",
+    )
+    def fetch_webhooks() -> Iterator[dict[str, Any]]:
+        """
+        Fetch webhooks from Monday.com.
+
+        Table format: webhooks (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Webhooks table must be in the format `webhooks`")
+
+        yield from monday_client.get_webhooks()
+
+    @dlt.resource(
+        name="updates",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def fetch_updates(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updated_at", initial_value=start_date
+        ),
+    ) -> Iterator[dict[str, Any]]:
+        """
+        Fetch updates from Monday.com.
+
+        Table format: updates (no parameters needed)
+        Requires start_date and end_date parameters
+        """
+        if len(params) != 0:
+            raise ValueError("Updates table must be in the format `updates`")
+
+        yield from monday_client.get_updates(start_date=start_date, end_date=end_date)
+
+    @dlt.resource(
+        name="teams",
+        write_disposition="replace",
+    )
+    def fetch_teams() -> Iterator[dict[str, Any]]:
+        """
+        Fetch teams from Monday.com.
+
+        Table format: teams (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Teams table must be in the format `teams`")
+
+        yield from monday_client.get_teams()
+
+    @dlt.resource(
+        name="tags",
+        write_disposition="replace",
+    )
+    def fetch_tags() -> Iterator[dict[str, Any]]:
+        """
+        Fetch tags from Monday.com.
+
+        Table format: tags (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Tags table must be in the format `tags`")
+
+        yield from monday_client.get_tags()
+
+    @dlt.resource(
+        name="custom_activities",
+        write_disposition="replace",
+    )
+    def fetch_custom_activities() -> Iterator[dict[str, Any]]:
+        """
+        Fetch custom activities from Monday.com.
+
+        Table format: custom_activities (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Custom activities table must be in the format `custom_activities`"
+            )
+
+        yield from monday_client.get_custom_activities()
+
+    @dlt.resource(
+        name="board_columns",
+        write_disposition="replace",
+    )
+    def fetch_board_columns() -> Iterator[dict[str, Any]]:
+        """
+        Fetch board columns from Monday.com.
+
+        Table format: board_columns (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError(
+                "Board columns table must be in the format `board_columns`"
+            )
+
+        yield from monday_client.get_board_columns()
+
+    @dlt.resource(
+        name="board_views",
+        write_disposition="replace",
+    )
+    def fetch_board_views() -> Iterator[dict[str, Any]]:
+        """
+        Fetch board views from Monday.com.
+
+        Table format: board_views (no parameters needed)
+        """
+        if len(params) != 0:
+            raise ValueError("Board views table must be in the format `board_views`")
+
+        yield from monday_client.get_board_views()
+
+    return (
+        fetch_account,
+        fetch_account_roles,
+        fetch_users,
+        fetch_boards,
+        fetch_workspaces,
+        fetch_webhooks,
+        fetch_updates,
+        fetch_teams,
+        fetch_tags,
+        fetch_custom_activities,
+        fetch_board_columns,
+        fetch_board_views,
+    )
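
A minimal sketch of instantiating the source and selecting one resource (the token is a placeholder; params must be empty for every table above):

from ingestr.src.monday import monday_source

# Placeholder token; params=[] because all tables take no extra parameters.
source = monday_source(api_token="MONDAY_API_TOKEN", params=[])
boards = source.with_resources("boards")  # keep only the `boards` resource
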
ingestr/src/monday/helpers.py ADDED
@@ -0,0 +1,392 @@
+from typing import Any, Dict, Iterator, Optional
+
+from ingestr.src.http_client import create_client
+
+from .settings import (
+    ACCOUNT_QUERY,
+    ACCOUNT_ROLES_QUERY,
+    BOARD_COLUMNS_QUERY,
+    BOARD_VIEWS_QUERY,
+    BOARDS_QUERY,
+    CUSTOM_ACTIVITIES_QUERY,
+    MAX_PAGE_SIZE,
+    TAGS_QUERY,
+    TEAMS_QUERY,
+    UPDATES_QUERY,
+    USERS_QUERY,
+    WEBHOOKS_QUERY,
+    WORKSPACES_QUERY,
+)
+
+
+def _paginate(
+    client: "MondayClient",
+    query: str,
+    field_name: str,
+    limit: int = 100,
+    extra_variables: Optional[Dict[str, Any]] = None,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Helper function to paginate through Monday.com API results.
+
+    Args:
+        client: MondayClient instance
+        query: GraphQL query with $limit and $page variables
+        field_name: Name of the field in the response to extract
+        limit: Number of results per page
+        extra_variables: Additional variables to pass to the query
+
+    Yields:
+        Normalized dictionaries from the API response
+    """
+    page = 1
+
+    while True:
+        variables = {
+            "limit": min(limit, MAX_PAGE_SIZE),
+            "page": page,
+        }
+
+        if extra_variables:
+            variables.update(extra_variables)
+
+        data = client._execute_query(query, variables)
+        items = data.get(field_name, [])
+
+        if not items:
+            break
+
+        for item in items:
+            yield normalize_dict(item)
+
+        if len(items) < limit:
+            break
+
+        page += 1
+
+
+def _get_all_board_ids(client: "MondayClient") -> list[str]:
+    """
+    Collect all board IDs from the Monday.com API.
+
+    Args:
+        client: MondayClient instance
+
+    Returns:
+        List of board IDs as strings
+    """
+    board_ids = []
+    for board in _paginate(client, BOARDS_QUERY, "boards", MAX_PAGE_SIZE):
+        board_id = board.get("id")
+        if board_id:
+            board_ids.append(str(board_id))
+    return board_ids
+
+
+def _fetch_nested_board_data(
+    client: "MondayClient", query: str, nested_field: str
+) -> Iterator[Dict[str, Any]]:
+    """
+    Fetch nested data from boards (columns, views, etc).
+
+    Args:
+        client: MondayClient instance
+        query: GraphQL query to execute
+        nested_field: Name of the nested field to extract (e.g., "columns", "views")
+
+    Yields:
+        Dict containing nested data with board_id added
+    """
+    board_ids = _get_all_board_ids(client)
+
+    if not board_ids:
+        return
+
+    for board_id in board_ids:
+        variables = {"board_ids": [board_id]}
+        data = client._execute_query(query, variables)
+        boards = data.get("boards", [])
+
+        for board in boards:
+            nested_items = board.get(nested_field, [])
+
+            if nested_items and isinstance(nested_items, list):
+                for item in nested_items:
+                    item_data = item.copy()
+                    item_data["board_id"] = board.get("id")
+                    yield normalize_dict(item_data)
+
+
+def _fetch_simple_list(
+    client: "MondayClient", query: str, field_name: str
+) -> Iterator[Dict[str, Any]]:
+    """
+    Fetch a simple list of items from Monday.com API without pagination.
+
+    Args:
+        client: MondayClient instance
+        query: GraphQL query to execute
+        field_name: Name of the field in the response to extract
+
+    Yields:
+        Normalized dictionaries from the API response
+    """
+    data = client._execute_query(query)
+    items = data.get(field_name, [])
+
+    for item in items:
+        yield normalize_dict(item)
+
+
+def normalize_dict(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize dictionary fields by detecting their structure:
+    - Convert nested objects with 'id' field to {field_name}_id
+    - Convert objects with other fields to flattened {field_name}_{subfield}
+    - Convert arrays to JSON strings for storage
+    - Preserve null values
+
+    Args:
+        data: The dictionary to normalize
+
+    Returns:
+        Normalized dictionary with flattened structure
+
+    Example:
+        >>> normalize_dict({"user": {"id": "123"}, "plan": {"tier": "pro"}})
+        {"user_id": "123", "plan_tier": "pro"}
+    """
+    import json
+
+    normalized: Dict[str, Any] = {}
+
+    for key, value in data.items():
+        if value is None:
+            # Keep null values as-is
+            normalized[key] = None
+        elif isinstance(value, dict):
+            # If the dict has only an 'id' field, replace with {key}_id
+            if "id" in value and len(value) == 1:
+                normalized[f"{key}_id"] = value["id"]
+            # If dict has multiple fields, flatten them
+            elif value:
+                for subkey, subvalue in value.items():
+                    normalized[f"{key}_{subkey}"] = subvalue
+        elif isinstance(value, list):
+            # If list contains dicts with only 'id' field, extract ids
+            if value and isinstance(value[0], dict) and list(value[0].keys()) == ["id"]:
+                normalized[key] = [item["id"] for item in value]
+            else:
+                # Convert other lists to JSON strings for storage
+                normalized[key] = json.dumps(value)
+        else:
+            # Add scalar values directly
+            normalized[key] = value
+
+    return normalized
+
+
+class MondayClient:
+    """Monday.com GraphQL API client."""
+
+    def __init__(self, api_token: str) -> None:
+        self.api_token = api_token
+        self.base_url = "https://api.monday.com/v2"
+        self.session = create_client()
+
+    def _headers(self) -> Dict[str, str]:
+        return {
+            "Authorization": self.api_token,
+            "Content-Type": "application/json",
+        }
+
+    def _execute_query(
+        self, query: str, variables: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """Execute a GraphQL query against Monday.com API."""
+        payload: Dict[str, Any] = {"query": query}
+        if variables:
+            payload["variables"] = variables
+
+        response = self.session.post(
+            self.base_url,
+            headers=self._headers(),
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+        if "errors" in data:
+            raise Exception(f"GraphQL errors: {data['errors']}")
+
+        return data.get("data", {})
+
+    def get_account(self) -> Dict[str, Any]:
+        """
+        Fetch account information from Monday.com API.
+
+        Returns:
+            Dict containing account data
+        """
+        data = self._execute_query(ACCOUNT_QUERY)
+        account = data.get("account", {})
+
+        if not account:
+            raise Exception("No account data returned from Monday.com API")
+
+        return normalize_dict(account)
+
+    def get_account_roles(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch account roles from Monday.com API.
+
+        Yields:
+            Dict containing account role data
+        """
+        yield from _fetch_simple_list(self, ACCOUNT_ROLES_QUERY, "account_roles")
+
+    def get_users(self, limit: int = MAX_PAGE_SIZE) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch users from Monday.com API with pagination.
+
+        Args:
+            limit: Number of results per page (max 100)
+
+        Yields:
+            Dict containing user data
+        """
+        yield from _paginate(self, USERS_QUERY, "users", limit)
+
+    def get_boards(self, limit: int = MAX_PAGE_SIZE) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch boards from Monday.com API with pagination.
+
+        Args:
+            limit: Number of results per page (max 100)
+
+        Yields:
+            Dict containing board data
+        """
+        yield from _paginate(self, BOARDS_QUERY, "boards", limit)
+
+    def get_workspaces(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch workspaces from Monday.com API.
+        First gets all boards to extract unique workspace IDs,
+        then fetches workspace details.
+
+        Yields:
+            Dict containing workspace data
+        """
+        # Collect unique workspace IDs from boards
+        workspace_ids = set()
+        for board in _paginate(self, BOARDS_QUERY, "boards", MAX_PAGE_SIZE):
+            workspace_id = board.get("workspace_id")
+            if workspace_id:
+                workspace_ids.add(str(workspace_id))
+
+        if not workspace_ids:
+            return
+
+        # Fetch workspace details
+        variables = {"ids": list(workspace_ids)}
+        data = self._execute_query(WORKSPACES_QUERY, variables)
+        workspaces = data.get("workspaces", [])
+
+        for workspace in workspaces:
+            yield normalize_dict(workspace)
+
+    def get_webhooks(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch webhooks from Monday.com API.
+        First gets all board IDs, then fetches webhooks for each board.
+
+        Yields:
+            Dict containing webhook data
+        """
+        board_ids = _get_all_board_ids(self)
+
+        for board_id in board_ids:
+            variables = {"board_id": board_id}
+            data = self._execute_query(WEBHOOKS_QUERY, variables)
+            webhooks = data.get("webhooks", [])
+
+            for webhook in webhooks:
+                yield normalize_dict(webhook)
+
+    def get_updates(
+        self,
+        limit: int = MAX_PAGE_SIZE,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch updates from Monday.com API.
+
+        Args:
+            limit: Number of results (max 100)
+            start_date: Start date in YYYY-MM-DD format
+            end_date: End date in YYYY-MM-DD format
+
+        Yields:
+            Dict containing update data
+        """
+        variables: Dict[str, Any] = {"limit": min(limit, MAX_PAGE_SIZE)}
+
+        if start_date:
+            variables["from_date"] = start_date
+        if end_date:
+            variables["to_date"] = end_date
+
+        data = self._execute_query(UPDATES_QUERY, variables)
+        updates = data.get("updates", [])
+
+        for update in updates:
+            yield normalize_dict(update)
+
+    def get_teams(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch teams from Monday.com API.
+
+        Yields:
+            Dict containing team data
+        """
+        yield from _fetch_simple_list(self, TEAMS_QUERY, "teams")
+
+    def get_tags(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch tags from Monday.com API.
+
+        Yields:
+            Dict containing tag data
+        """
+        yield from _fetch_simple_list(self, TAGS_QUERY, "tags")
+
+    def get_custom_activities(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch custom activities from Monday.com API.
+
+        Yields:
+            Dict containing custom activity data
+        """
+        yield from _fetch_simple_list(self, CUSTOM_ACTIVITIES_QUERY, "custom_activity")
+
+    def get_board_columns(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch board columns from Monday.com API.
+        First gets all board IDs, then fetches columns for each board.
+
+        Yields:
+            Dict containing board column data with board_id
+        """
+        yield from _fetch_nested_board_data(self, BOARD_COLUMNS_QUERY, "columns")
+
+    def get_board_views(self) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch board views from Monday.com API.
+        First gets all board IDs, then fetches views for each board.
+
+        Yields:
+            Dict containing board view data with board_id
+        """
+        yield from _fetch_nested_board_data(self, BOARD_VIEWS_QUERY, "views")
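
The flattening rules in normalize_dict are easiest to see on sample inputs (the values below are made-up examples derived from the branches above):

from ingestr.src.monday.helpers import normalize_dict

print(normalize_dict({"creator": {"id": "42"}}))
# -> {"creator_id": "42"}
print(normalize_dict({"plan": {"tier": "pro", "period": "yearly"}}))
# -> {"plan_tier": "pro", "plan_period": "yearly"}
print(normalize_dict({"owners": [{"id": "1"}, {"id": "2"}]}))
# -> {"owners": ["1", "2"]}
print(normalize_dict({"permissions": ["read", "write"]}))
# -> {"permissions": '["read", "write"]'}  (JSON-encoded string)
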
ingestr/src/monday/settings.py ADDED
@@ -0,0 +1,328 @@
+"""Monday.com source settings and constants"""
+
+# GraphQL query for fetching app installs
+APP_INSTALLS_QUERY = """
+query ($app_id: ID!, $account_id: ID, $limit: Int!, $page: Int!) {
+    app_installs(
+        app_id: $app_id
+        account_id: $account_id
+        limit: $limit
+        page: $page
+    ) {
+        app_id
+        timestamp
+        app_install_account {
+            id
+        }
+        app_install_user {
+            id
+        }
+        app_version {
+            major
+            minor
+            patch
+            type
+            text
+        }
+        permissions {
+            approved_scopes
+            required_scopes
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching account information
+ACCOUNT_QUERY = """
+query {
+    account {
+        id
+        name
+        slug
+        tier
+        country_code
+        first_day_of_the_week
+        show_timeline_weekends
+        sign_up_product_kind
+        active_members_count
+        logo
+        plan {
+            max_users
+            period
+            tier
+            version
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching account roles
+ACCOUNT_ROLES_QUERY = """
+query {
+    account_roles {
+        id
+        name
+        roleType
+    }
+}
+"""
+
+# GraphQL query for fetching users
+USERS_QUERY = """
+query ($limit: Int!, $page: Int!) {
+    users(limit: $limit, page: $page) {
+        id
+        name
+        email
+        enabled
+        is_admin
+        is_guest
+        is_pending
+        is_view_only
+        created_at
+        birthday
+        country_code
+        join_date
+        location
+        mobile_phone
+        phone
+        photo_original
+        photo_thumb
+        photo_tiny
+        time_zone_identifier
+        title
+        url
+        utc_hours_diff
+        current_language
+        account {
+            id
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching boards
+BOARDS_QUERY = """
+query ($limit: Int!, $page: Int!) {
+    boards(limit: $limit, page: $page) {
+        id
+        name
+        description
+        state
+        board_kind
+        board_folder_id
+        workspace_id
+        permissions
+        item_terminology
+        items_count
+        updated_at
+        url
+        communication
+        object_type_unique_key
+        type
+        creator {
+            id
+        }
+        owners {
+            id
+        }
+        subscribers {
+            id
+        }
+        team_owners {
+            id
+        }
+        team_subscribers {
+            id
+        }
+        tags {
+            id
+
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching custom activities
+CUSTOM_ACTIVITIES_QUERY = """
+query {
+    custom_activity {
+        id
+        name
+        type
+        color
+        icon_id
+    }
+}
+"""
+
+# GraphQL query for fetching board columns
+BOARD_COLUMNS_QUERY = """
+query ($board_ids: [ID!]) {
+    boards(ids: $board_ids) {
+        id
+        columns {
+            id
+            title
+            type
+            archived
+            description
+            settings_str
+            width
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching board views
+BOARD_VIEWS_QUERY = """
+query ($board_ids: [ID!]) {
+    boards(ids: $board_ids) {
+        id
+        views {
+            id
+            name
+            type
+            settings_str
+            view_specific_data_str
+            source_view_id
+            access_level
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching workspaces by IDs
+WORKSPACES_QUERY = """
+query ($ids: [ID!]) {
+    workspaces(ids: $ids) {
+        id
+        name
+        kind
+        description
+        created_at
+        is_default_workspace
+        state
+        account_product {
+            id
+        }
+        owners_subscribers {
+            id
+        }
+        team_owners_subscribers {
+            id
+        }
+        teams_subscribers {
+            id
+        }
+        users_subscribers {
+            id
+        }
+        settings {
+            icon
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching webhooks by board ID
+WEBHOOKS_QUERY = """
+query ($board_id: ID!) {
+    webhooks(board_id: $board_id) {
+        id
+        event
+        board_id
+        config
+    }
+}
+"""
+
+# GraphQL query for fetching updates
+UPDATES_QUERY = """
+query ($limit: Int!, $from_date: String, $to_date: String) {
+    updates(limit: $limit, from_date: $from_date, to_date: $to_date) {
+        id
+        body
+        text_body
+        created_at
+        updated_at
+        edited_at
+        creator_id
+        item_id
+        creator {
+            id
+        }
+        item {
+            id
+        }
+        assets {
+            id
+            name
+            file_extension
+            file_size
+            public_url
+            url
+            url_thumbnail
+            created_at
+            original_geometry
+            uploaded_by {
+                id
+            }
+        }
+        replies {
+            id
+            body
+            text_body
+            created_at
+            updated_at
+            creator_id
+            creator {
+                id
+            }
+        }
+        likes {
+            id
+        }
+        pinned_to_top {
+            item_id
+        }
+        viewers {
+            medium
+            user_id
+            user {
+                id
+            }
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching teams
+TEAMS_QUERY = """
+query {
+    teams {
+        id
+        name
+        picture_url
+        users {
+            id
+            created_at
+            phone
+        }
+    }
+}
+"""
+
+# GraphQL query for fetching tags
+TAGS_QUERY = """
+query {
+    tags {
+        id
+        name
+        color
+    }
+}
+"""
+
+# Maximum number of results per page
+MAX_PAGE_SIZE = 100
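
A sketch of how these constants pair with variables at query time, mirroring _paginate in helpers.py (the token is a placeholder, and _execute_query is the client's private helper):

from ingestr.src.monday.helpers import MondayClient
from ingestr.src.monday.settings import MAX_PAGE_SIZE, USERS_QUERY

client = MondayClient("MONDAY_API_TOKEN")  # placeholder token
data = client._execute_query(USERS_QUERY, {"limit": MAX_PAGE_SIZE, "page": 1})
users = data.get("users", [])
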
ingestr/src/sources.py CHANGED
@@ -3878,3 +3878,86 @@ class IntercomSource:
             start_date=start_date,
             end_date=end_date,
         ).with_resources(table)
+
+
+class HttpSource:
+    """Source for reading CSV, JSON, and Parquet files from HTTP URLs"""
+
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        """
+        Create a dlt source for reading files from HTTP URLs.
+
+        URI format: http://example.com/file.csv or https://example.com/file.json
+
+        Args:
+            uri: HTTP(S) URL to the file
+            table: Not used for HTTP source (files are read directly)
+            **kwargs: Additional arguments:
+                - file_format: Optional file format override ('csv', 'json', 'parquet')
+                - chunksize: Number of records to process at once (default varies by format)
+                - merge_key: Merge key for the resource
+
+        Returns:
+            DltResource for the HTTP file
+        """
+        from ingestr.src.http import http_source
+
+        # Extract the actual URL (remove the http:// or https:// scheme if duplicated)
+        url = uri
+        if uri.startswith("http://http://") or uri.startswith("https://https://"):
+            url = uri.split("://", 1)[1]
+
+        file_format = kwargs.get("file_format")
+        chunksize = kwargs.get("chunksize")
+        merge_key = kwargs.get("merge_key")
+
+        reader_kwargs = {}
+        if chunksize is not None:
+            reader_kwargs["chunksize"] = chunksize
+
+        source = http_source(url=url, file_format=file_format, **reader_kwargs)
+
+        if merge_key:
+            source.apply_hints(merge_key=merge_key)
+
+        return source
+
+
+class MondaySource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        query_params = parse_qs(parsed_uri.query)
+        api_token = query_params.get("api_token")
+
+        if api_token is None:
+            raise MissingValueError("api_token", "Monday")
+
+        parts = table.replace(" ", "").split(":")
+        table_name = parts[0]
+        params = parts[1:]
+
+        # Get interval_start and interval_end from kwargs (command line args)
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+
+        # Convert datetime to string format YYYY-MM-DD
+        start_date = interval_start.strftime("%Y-%m-%d") if interval_start else None
+        end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None
+
+        from ingestr.src.monday import monday_source
+
+        try:
+            return monday_source(
+                api_token=api_token[0],
+                params=params,
+                start_date=start_date,
+                end_date=end_date,
+            ).with_resources(table_name)
+        except ResourcesNotFoundError:
+            raise UnsupportedResourceError(table_name, "Monday")
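
A minimal sketch of how these two classes are driven (URIs and tokens are placeholders; for HttpSource the table argument is unused by design):

from ingestr.src.sources import HttpSource, MondaySource

http_src = HttpSource().dlt_source(
    uri="https://example.com/data.csv", table="http_data", chunksize=1000
)

monday_src = MondaySource().dlt_source(
    uri="monday://?api_token=MONDAY_API_TOKEN", table="boards"
)
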
ingestr-0.14.5.dist-info/METADATA → ingestr-0.14.6.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.14.5
+Version: 0.14.6
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
ingestr-0.14.5.dist-info/RECORD → ingestr-0.14.6.dist-info/RECORD RENAMED
@@ -2,17 +2,17 @@ ingestr/conftest.py,sha256=OE2yxeTCosS9CUFVuqNypm-2ftYvVBeeq7egm3878cI,1981
 ingestr/main.py,sha256=qo0g3wCFl8a_1jUwXagX8L1Q8PKKQlTF7md9pfnzW0Y,27155
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
-ingestr/src/buildinfo.py,sha256=8EpHAODYK-dLsm_OMa9EmwMw0Qxn4aTYbjDN-PvxxBQ,20
+ingestr/src/buildinfo.py,sha256=dazsjzReTYtam8X7FVSN4WAYUmvlPNZ0XztT57SOHTU,20
 ingestr/src/destinations.py,sha256=QtjE0AGs0WkPHaI2snWPHJ8HHi4lwXUBYLJPklz8Mvk,27772
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256=NNUwks5V66iaWfOblF9MtIpclBasFjYZsMXOKpXUKq4,7538
+ingestr/src/factory.py,sha256=03eGDe2rL6qyT5sGmTGZi-XIwJbbdoedE_KjW3ZF7QY,7661
 ingestr/src/filters.py,sha256=0n0sNAVG_f-B_1r7lW5iNtw9z_G1bxWzPaiL1i6tnbU,1665
 ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
 ingestr/src/masking.py,sha256=VN0LdfvExhQ1bZMRylGtaBUIoH-vjuIUmRnYKwo3yiY,11358
 ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
 ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
-ingestr/src/sources.py,sha256=MuMD7r8_ojCAMVix5p5BD461biZdT5AKE4YZspikrHw,135647
+ingestr/src/sources.py,sha256=yQhmgIIfzMr8qHxQr-yDmzowti_q59khRzBDPY0Kw-I,138486
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -83,6 +83,8 @@ ingestr/src/google_sheets/helpers/api_calls.py,sha256=RiVfdacbaneszhmuhYilkJnkc9
 ingestr/src/google_sheets/helpers/data_processing.py,sha256=RNt2MYfdJhk4bRahnQVezpNg2x9z0vx60YFq2ukZ8vI,11004
 ingestr/src/gorgias/__init__.py,sha256=_mFkMYwlY5OKEY0o_FK1OKol03A-8uk7bm1cKlmt5cs,21432
 ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOINE,5427
+ingestr/src/http/__init__.py,sha256=Y9mQIE0RolHOh6dPjW41qzYXSG8BC0GPKxEtz2CJGpU,902
+ingestr/src/http/readers.py,sha256=rgBwYG5SOQ7P2uzBAFMOQIevKxV51ZW41VSiRTZ0Xvo,3863
 ingestr/src/hubspot/__init__.py,sha256=FCqjLeOjijdc9JC_NoDwtRqy3FDyY-szDi6UV7CdDN0,11548
 ingestr/src/hubspot/helpers.py,sha256=k2b-lhxqBNKHoOSHoHegFSsk8xxjjGA0I04V0XyX2b4,7883
 ingestr/src/hubspot/settings.py,sha256=i73MkSiJfRLMFLfiJgYdhp-rhymHTfoqFzZ4uOJdFJM,2456
@@ -109,6 +111,9 @@ ingestr/src/linkedin_ads/dimension_time_enum.py,sha256=EmHRdkFyTAfo4chGjThrwqffW
 ingestr/src/linkedin_ads/helpers.py,sha256=eUWudRVlXl4kqIhfXQ1eVsUpZwJn7UFqKSpnbLfxzds,4498
 ingestr/src/mixpanel/__init__.py,sha256=s1QtqMP0BTGW6YtdCabJFWj7lEn7KujzELwGpBOQgfs,1796
 ingestr/src/mixpanel/client.py,sha256=c_reouegOVYBOwHLfgYFwpmkba0Sxro1Zkml07NCYf0,3602
+ingestr/src/monday/__init__.py,sha256=ZNdGCC_1CEYlgxAef-5QO56Drm9IMP82-rZpEvbD8aY,6918
+ingestr/src/monday/helpers.py,sha256=xkAYTFIwjbU-dQTa4d41oQm6kFvCHv74AhCmN-H8aPE,11572
+ingestr/src/monday/settings.py,sha256=5TC0OrTHQO52AifwP3Z2xsh4D8SDUq0WxqY5AQMjcns,5667
 ingestr/src/mongodb/__init__.py,sha256=6-DvvaKL7XOPPRwItI7lSpoMQLEPzYubV6dKhpzbuME,7494
 ingestr/src/mongodb/helpers.py,sha256=BKb0F-AUWjFJikE9OPP9z5wFuMmJsf8YsyWhvQ9dC1k,38076
 ingestr/src/notion/__init__.py,sha256=36wUui8finbc85ObkRMq8boMraXMUehdABN_AMe_hzA,1834
@@ -175,8 +180,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
 ingestr/tests/unit/test_smartsheets.py,sha256=zf3DXT29Y4TH2lNPBFphdjlaelUUyPJcsW2UO68RzDs,4862
-ingestr-0.14.5.dist-info/METADATA,sha256=YSfv9AqmjLd5qjotu6cNcKxFQT8FMQu32EsY60B07Tw,15265
-ingestr-0.14.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ingestr-0.14.5.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
-ingestr-0.14.5.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
-ingestr-0.14.5.dist-info/RECORD,,
+ingestr-0.14.6.dist-info/METADATA,sha256=3akmbk91m4xi1AMYFkuMPKmAtlcUSzwCOsYbeXwFlsk,15265
+ingestr-0.14.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.14.6.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.14.6.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.14.6.dist-info/RECORD,,