graflo 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of graflo might be problematic. Click here for more details.

Files changed (70) hide show
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
@@ -0,0 +1,339 @@
1
+ """REST API data source implementation.
2
+
3
+ This module provides a data source for REST API endpoints, supporting
4
+ full HTTP configuration including authentication, headers, pagination,
5
+ and retry logic.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import dataclasses
11
+ import logging
12
+ from typing import Any, Iterator
13
+
14
+ import requests
15
+ from requests.adapters import HTTPAdapter
16
+ from requests.auth import HTTPBasicAuth, HTTPDigestAuth
17
+ from urllib3.util.retry import Retry
18
+
19
+ from graflo.data_source.base import AbstractDataSource, DataSourceType
20
+ from graflo.onto import BaseDataclass
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclasses.dataclass
class PaginationConfig(BaseDataclass):
    """Configuration for API pagination.

    Supports multiple pagination strategies:
    - offset: Offset-based pagination (offset, limit)
    - cursor: Cursor-based pagination (cursor parameter)
    - page: Page-based pagination (page, per_page)

    Attributes:
        strategy: Pagination strategy ('offset', 'cursor', 'page')
        offset_param: Parameter name for offset (default: 'offset')
        limit_param: Parameter name for limit (default: 'limit')
        cursor_param: Parameter name for cursor (default: 'cursor')
        page_param: Parameter name for page (default: 'page')
        per_page_param: Parameter name for per_page (default: 'per_page')
        initial_offset: Initial offset value (default: 0)
        initial_page: Initial page value (default: 1)
        page_size: Number of items per page (default: 100)
        cursor_path: JSON path to cursor in response (for cursor-based)
        has_more_path: JSON path to has_more flag in response
        data_path: JSON path to data array in response (default: root)
    """

    strategy: str = "offset"  # 'offset', 'cursor', 'page'
    # Query-parameter names used when building each paginated request.
    offset_param: str = "offset"
    limit_param: str = "limit"
    cursor_param: str = "cursor"
    page_param: str = "page"
    per_page_param: str = "per_page"
    # Starting values used for the first request of an iteration.
    initial_offset: int = 0
    initial_page: int = 1
    page_size: int = 100
    # Dot-separated paths into the JSON response body (e.g. "meta.next").
    cursor_path: str | None = None  # JSON path like "next_cursor"
    has_more_path: str | None = None  # JSON path like "has_more"
    data_path: str | None = None  # JSON path to data array, None means root
61
+
62
+
63
@dataclasses.dataclass
class APIConfig(BaseDataclass):
    """Configuration for REST API data source.

    Attributes:
        url: API endpoint URL
        method: HTTP method (default: 'GET')
        headers: HTTP headers as dictionary
        auth: Authentication configuration
            - For Basic auth: {'type': 'basic', 'username': '...', 'password': '...'}
            - For Bearer token: {'type': 'bearer', 'token': '...'}
            - For Digest auth: {'type': 'digest', 'username': '...', 'password': '...'}
        params: Query parameters as dictionary
        timeout: Request timeout in seconds (default: None for no timeout)
        retries: Number of retry attempts (default: 0)
        retry_backoff_factor: Backoff factor for retries (default: 0.1)
        retry_status_forcelist: HTTP status codes to retry on (default: [500, 502, 503, 504])
        verify: Verify SSL certificates (default: True)
        pagination: Pagination configuration (default: None)
    """

    url: str
    method: str = "GET"
    headers: dict[str, str] = dataclasses.field(default_factory=dict)
    auth: dict[str, Any] | None = None
    params: dict[str, Any] = dataclasses.field(default_factory=dict)
    # NOTE(review): timeout=None means requests may block indefinitely on an
    # unresponsive server — callers should set an explicit timeout.
    timeout: float | None = None
    retries: int = 0
    retry_backoff_factor: float = 0.1
    retry_status_forcelist: list[int] = dataclasses.field(
        default_factory=lambda: [500, 502, 503, 504]
    )
    verify: bool = True
    pagination: PaginationConfig | None = None
97
+
98
+
99
@dataclasses.dataclass
class APIDataSource(AbstractDataSource):
    """Data source for REST API endpoints.

    This class provides a data source for REST API endpoints, supporting
    full HTTP configuration, authentication, pagination, and retry logic.
    Returns JSON responses as hierarchical dictionaries, similar to JSON files.

    Attributes:
        config: API configuration
    """

    config: APIConfig

    def __post_init__(self):
        """Initialize the API data source."""
        self.source_type = DataSourceType.API

    def _create_session(self) -> requests.Session:
        """Create a requests session with retry and auth configuration.

        Returns:
            Configured requests session
        """
        session = requests.Session()

        # Configure retries; mount on both schemes so redirects keep the policy.
        if self.config.retries > 0:
            retry_strategy = Retry(
                total=self.config.retries,
                backoff_factor=self.config.retry_backoff_factor,
                status_forcelist=self.config.retry_status_forcelist,
            )
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

        # Configure authentication
        if self.config.auth:
            auth_type = self.config.auth.get("type", "").lower()
            if auth_type == "basic":
                session.auth = HTTPBasicAuth(
                    self.config.auth.get("username", ""),
                    self.config.auth.get("password", ""),
                )
            elif auth_type == "digest":
                session.auth = HTTPDigestAuth(
                    self.config.auth.get("username", ""),
                    self.config.auth.get("password", ""),
                )
            elif auth_type == "bearer":
                token = self.config.auth.get("token", "")
                session.headers["Authorization"] = f"Bearer {token}"

        # User headers are applied last so they override session defaults.
        session.headers.update(self.config.headers)

        return session

    def _extract_data(self, response: dict | list) -> list[dict]:
        """Extract the data array from an API response.

        Navigates ``pagination.data_path`` (dot-separated keys; numeric
        segments index into lists) when configured; otherwise the response
        root is treated as the data.

        Args:
            response: API response as dictionary or list

        Returns:
            List of data items; empty list when the path cannot be resolved
        """
        if self.config.pagination and self.config.pagination.data_path:
            data: Any = response
            for part in self.config.pagination.data_path.split("."):
                if isinstance(data, dict):
                    data = data.get(part)
                elif isinstance(data, list):
                    # A non-integer segment or out-of-range index means the
                    # path does not resolve; return [] instead of raising.
                    try:
                        data = data[int(part)]
                    except (ValueError, IndexError):
                        return []
                else:
                    return []
            if isinstance(data, list):
                return data
            if isinstance(data, dict):
                return [data]
            return []
        # Root-level data: a list is used as-is, a dict is a single item.
        if isinstance(response, list):
            return response
        if isinstance(response, dict):
            return [response]
        return []

    def _has_more(self, response: dict) -> bool:
        """Check if there are more pages to fetch.

        Args:
            response: API response as dictionary

        Returns:
            True if there are more pages
        """
        if not self.config.pagination:
            return False

        if self.config.pagination.has_more_path:
            value: Any = response
            for part in self.config.pagination.has_more_path.split("."):
                if isinstance(value, dict):
                    value = value.get(part)
                else:
                    return False
            return bool(value)

        # Default heuristic: keep going while the data array is non-empty.
        return len(self._extract_data(response)) > 0

    def _get_next_cursor(self, response: dict) -> str | None:
        """Get next cursor from response.

        Args:
            response: API response as dictionary

        Returns:
            Next cursor value or None
        """
        if not self.config.pagination or not self.config.pagination.cursor_path:
            return None

        value: Any = response
        for part in self.config.pagination.cursor_path.split("."):
            if isinstance(value, dict):
                value = value.get(part)
            else:
                return None
        return str(value) if value is not None else None

    def _pagination_params(
        self, offset: int, page: int, cursor: str | None
    ) -> dict[str, Any]:
        """Build the query parameters for the current request.

        Args:
            offset: Current offset (offset strategy)
            page: Current page number (page strategy)
            cursor: Current cursor, if any (cursor strategy)

        Returns:
            Merged base params plus pagination params for the active strategy
        """
        params = self.config.params.copy()
        pg = self.config.pagination
        if pg:
            if pg.strategy == "offset":
                params[pg.offset_param] = offset
                params[pg.limit_param] = pg.page_size
            elif pg.strategy == "page":
                params[pg.page_param] = page
                params[pg.per_page_param] = pg.page_size
            elif pg.strategy == "cursor" and cursor:
                params[pg.cursor_param] = cursor
        return params

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Iterate over API data in batches.

        Args:
            batch_size: Number of items per batch
            limit: Maximum number of items to retrieve (None for no limit)

        Yields:
            list[dict]: Batches of documents as dictionaries
        """
        session = self._create_session()
        total_items = 0

        try:
            # Initialize pagination state
            pg = self.config.pagination
            offset = pg.initial_offset if pg else 0
            page = pg.initial_page if pg else 1
            cursor: str | None = None

            while True:
                params = self._pagination_params(offset, page, cursor)

                # Make request; on failure, stop iteration rather than raise
                # so partially fetched data remains usable upstream.
                try:
                    response = session.request(
                        method=self.config.method,
                        url=self.config.url,
                        params=params,
                        timeout=self.config.timeout,
                        verify=self.config.verify,
                    )
                    response.raise_for_status()
                    data = response.json()
                except requests.RequestException as e:
                    logger.error("API request failed: %s", e)
                    break

                items = self._extract_data(data)

                # Accumulate into batches. `limit is not None` (not truthiness)
                # so that limit=0 honors the documented "maximum items" contract
                # instead of meaning "unlimited".
                batch: list[dict] = []
                for item in items:
                    if limit is not None and total_items >= limit:
                        break
                    batch.append(item)
                    total_items += 1

                    if len(batch) >= batch_size:
                        yield batch
                        batch = []

                # Yield remaining items
                if batch:
                    yield batch

                if limit is not None and total_items >= limit:
                    break

                # Advance pagination state; no pagination means single request.
                if pg is None:
                    break
                if pg.strategy == "offset":
                    if not self._has_more(data):
                        break
                    offset += pg.page_size
                elif pg.strategy == "page":
                    if not self._has_more(data):
                        break
                    page += 1
                elif pg.strategy == "cursor":
                    cursor = self._get_next_cursor(data)
                    if not cursor:
                        break
                else:
                    # Unknown strategy: stop instead of re-requesting the same
                    # page forever (the original fell through all branches).
                    break

        finally:
            session.close()
@@ -0,0 +1,95 @@
1
+ """Base classes for data source abstraction.
2
+
3
+ This module defines the abstract base class and types for all data sources.
4
+ Data sources handle data retrieval from various sources (files, APIs, databases)
5
+ and provide a unified interface for batch iteration.
6
+ """
7
+
8
+ import abc
9
+ from typing import Iterator
10
+
11
+ from graflo.onto import BaseDataclass, BaseEnum
12
+
13
+
14
class DataSourceType(BaseEnum):
    """Types of data sources supported by the system.

    FILE: File-based data sources (JSON, JSONL, CSV/TSV)
    API: REST API data sources
    SQL: SQL database data sources
    IN_MEMORY: In-memory data sources (lists, DataFrames)
    """

    # NOTE(review): the string values look like stable external identifiers
    # (e.g. used in configuration) — confirm before renaming any of them.
    FILE = "file"
    API = "api"
    SQL = "sql"
    IN_MEMORY = "in_memory"
27
+
28
+
29
class AbstractDataSource(BaseDataclass, abc.ABC):
    """Abstract base class for all data sources.

    Data sources handle data retrieval from various sources and provide
    a unified interface for batch iteration. They are separate from Resources,
    which handle data transformation. Many DataSources can map to the same Resource.

    Attributes:
        source_type: Type of the data source
        resource_name: Name of the resource this data source maps to
            (set externally via DataSourceRegistry)
    """

    source_type: DataSourceType

    def __post_init__(self):
        """Initialize the data source after dataclass initialization."""
        # Assigned later through the resource_name property; None until then.
        self._resource_name: str | None = None

    @property
    def resource_name(self) -> str | None:
        """Get the resource name this data source maps to.

        Returns:
            Resource name or None if not set
        """
        return self._resource_name

    @resource_name.setter
    def resource_name(self, value: str | None):
        """Set the resource name this data source maps to.

        Args:
            value: Resource name to set
        """
        self._resource_name = value

    @abc.abstractmethod
    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Iterate over data in batches.

        This method yields batches of documents (dictionaries) from the data source.
        Each batch is a list of dictionaries representing the data items.

        Args:
            batch_size: Number of items per batch
            limit: Maximum number of items to retrieve (None for no limit)

        Yields:
            list[dict]: Batches of documents as dictionaries

        Raises:
            NotImplementedError: Must be implemented by subclasses
        """
        raise NotImplementedError("Subclasses must implement iter_batches")

    def __iter__(self) -> Iterator[dict]:
        """Make data source iterable, yielding individual items.

        Yields:
            dict: Individual documents
        """
        # Use the subclass's default batch size instead of batch_size=1: the
        # item stream is identical, but one-item batches multiply per-batch
        # overhead (e.g. one fetch round-trip per item in some sources).
        for batch in self.iter_batches(limit=None):
            yield from batch