graflo 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1120 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +297 -0
- graflo/architecture/onto.py +374 -0
- graflo/architecture/resource.py +161 -0
- graflo/architecture/schema.py +136 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +93 -0
- graflo/architecture/vertex.py +586 -0
- graflo/caster.py +655 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +194 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +97 -0
- graflo/data_source/factory.py +298 -0
- graflo/data_source/file.py +133 -0
- graflo/data_source/memory.py +72 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +185 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1026 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +688 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +156 -0
- graflo/db/postgres/conn.py +425 -0
- graflo/db/postgres/resource_mapping.py +139 -0
- graflo/db/postgres/schema_inference.py +245 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2212 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +190 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +556 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +751 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +332 -0
- graflo/util/transform.py +448 -0
- graflo-1.3.3.dist-info/METADATA +190 -0
- graflo-1.3.3.dist-info/RECORD +64 -0
- graflo-1.3.3.dist-info/WHEEL +4 -0
- graflo-1.3.3.dist-info/entry_points.txt +5 -0
- graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
"""REST API data source implementation.
|
|
2
|
+
|
|
3
|
+
This module provides a data source for REST API endpoints, supporting
|
|
4
|
+
full HTTP configuration including authentication, headers, pagination,
|
|
5
|
+
and retry logic.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import dataclasses
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Any, Iterator
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
from requests.adapters import HTTPAdapter
|
|
16
|
+
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
|
|
17
|
+
from urllib3.util.retry import Retry
|
|
18
|
+
|
|
19
|
+
from graflo.data_source.base import AbstractDataSource, DataSourceType
|
|
20
|
+
from graflo.onto import BaseDataclass
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclasses.dataclass
class PaginationConfig(BaseDataclass):
    """Pagination settings for an API data source.

    Three strategies are understood:

    * ``offset`` -- the request carries an offset and a limit parameter.
    * ``cursor`` -- the request carries an opaque cursor taken from the
      previous response.
    * ``page``   -- the request carries a page number and a page size.

    The ``*_path`` attributes are dot-separated JSON paths into the
    response body (e.g. ``"meta.next_cursor"``).
    """

    strategy: str = "offset"  # one of 'offset', 'cursor', 'page'
    offset_param: str = "offset"  # query-parameter name for the offset
    limit_param: str = "limit"  # query-parameter name for the limit
    cursor_param: str = "cursor"  # query-parameter name for the cursor
    page_param: str = "page"  # query-parameter name for the page number
    per_page_param: str = "per_page"  # query-parameter name for the page size
    initial_offset: int = 0  # offset used for the first request
    initial_page: int = 1  # page number used for the first request
    page_size: int = 100  # items requested per page
    cursor_path: str | None = None  # JSON path to next cursor, e.g. "next_cursor"
    has_more_path: str | None = None  # JSON path to a has-more flag, e.g. "has_more"
    data_path: str | None = None  # JSON path to the data array; None means root
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclasses.dataclass
class APIConfig(BaseDataclass):
    """Configuration for a REST API data source.

    The ``auth`` mapping selects the authentication scheme via its
    ``type`` key:

    * ``{'type': 'basic', 'username': ..., 'password': ...}``
    * ``{'type': 'digest', 'username': ..., 'password': ...}``
    * ``{'type': 'bearer', 'token': ...}``
    """

    url: str  # API endpoint URL
    method: str = "GET"  # HTTP method
    headers: dict[str, str] = dataclasses.field(default_factory=dict)  # HTTP headers
    auth: dict[str, Any] | None = None  # authentication configuration (see above)
    params: dict[str, Any] = dataclasses.field(default_factory=dict)  # query parameters
    timeout: float | None = None  # request timeout in seconds; None means no timeout
    retries: int = 0  # number of retry attempts
    retry_backoff_factor: float = 0.1  # backoff factor between retries
    retry_status_forcelist: list[int] = dataclasses.field(
        default_factory=lambda: [500, 502, 503, 504]
    )  # HTTP status codes that trigger a retry
    verify: bool = True  # verify SSL certificates
    pagination: PaginationConfig | None = None  # optional pagination settings
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclasses.dataclass
class APIDataSource(AbstractDataSource):
    """Data source for REST API endpoints.

    Provides full HTTP configuration (method, headers, authentication,
    TLS verification), retry logic, and offset-, page-, and cursor-based
    pagination. JSON responses are returned as hierarchical dictionaries,
    similar to JSON files.

    Attributes:
        config: API configuration
    """

    config: APIConfig

    def __post_init__(self):
        """Initialize the API data source."""
        # Chain up so AbstractDataSource.__post_init__ initializes
        # _resource_name: dataclasses do not call the parent __post_init__
        # automatically, and skipping it makes the inherited resource_name
        # property raise AttributeError.
        super().__post_init__()
        self.source_type = DataSourceType.API

    def _create_session(self) -> requests.Session:
        """Create a requests session with retries, auth, and headers applied.

        Returns:
            Configured requests session
        """
        session = requests.Session()

        # Mount a retrying adapter for both schemes when retries are requested.
        if self.config.retries > 0:
            retry_strategy = Retry(
                total=self.config.retries,
                backoff_factor=self.config.retry_backoff_factor,
                status_forcelist=self.config.retry_status_forcelist,
            )
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

        # Configure authentication from the auth config dict.
        if self.config.auth:
            auth_type = self.config.auth.get("type", "").lower()
            if auth_type == "basic":
                session.auth = HTTPBasicAuth(
                    self.config.auth.get("username", ""),
                    self.config.auth.get("password", ""),
                )
            elif auth_type == "digest":
                session.auth = HTTPDigestAuth(
                    self.config.auth.get("username", ""),
                    self.config.auth.get("password", ""),
                )
            elif auth_type == "bearer":
                token = self.config.auth.get("token", "")
                session.headers["Authorization"] = f"Bearer {token}"

        # User headers are applied last so they can override anything set
        # above (including a bearer Authorization header).
        session.headers.update(self.config.headers)

        return session

    @staticmethod
    def _resolve_json_path(document: Any, path: str) -> Any:
        """Follow a dot-separated JSON path into a decoded response.

        Dict segments are looked up by key, list segments by integer index.

        Args:
            document: Decoded JSON value to navigate
            path: Dot-separated path, e.g. "meta.next_cursor"

        Returns:
            The resolved value, or None when the path cannot be resolved
        """
        value = document
        for part in path.split("."):
            if isinstance(value, dict):
                value = value.get(part)
            elif isinstance(value, list):
                # List segments are addressed by integer index; a
                # non-numeric or out-of-range segment means the path
                # does not resolve.
                try:
                    value = value[int(part)]
                except (ValueError, IndexError):
                    return None
            else:
                return None
        return value

    def _extract_data(self, response: dict | list) -> list[dict]:
        """Extract the data array from an API response.

        When ``pagination.data_path`` is set, the path is resolved first;
        otherwise the root of the response is used. A dict is wrapped in a
        one-item list; non-list/non-dict values yield an empty list.

        Args:
            response: API response as dictionary or list

        Returns:
            List of data items
        """
        if self.config.pagination and self.config.pagination.data_path:
            data = self._resolve_json_path(
                response, self.config.pagination.data_path
            )
        else:
            data = response
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return [data]
        return []

    def _has_more(self, response: dict) -> bool:
        """Check if there are more pages to fetch.

        Uses ``pagination.has_more_path`` when configured; otherwise falls
        back to "the current page contained at least one item".

        Args:
            response: API response as dictionary

        Returns:
            True if there are more pages
        """
        if not self.config.pagination:
            return False

        if self.config.pagination.has_more_path:
            value = self._resolve_json_path(
                response, self.config.pagination.has_more_path
            )
            return bool(value)

        # Default: check if data array is not empty.
        return len(self._extract_data(response)) > 0

    def _get_next_cursor(self, response: dict) -> str | None:
        """Get the next cursor from a response.

        Args:
            response: API response as dictionary

        Returns:
            Next cursor value or None
        """
        if not self.config.pagination or not self.config.pagination.cursor_path:
            return None
        value = self._resolve_json_path(
            response, self.config.pagination.cursor_path
        )
        return str(value) if value is not None else None

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Iterate over API data in batches.

        Performs paginated requests according to ``config.pagination`` (or a
        single request when pagination is not configured). Request failures
        are logged and terminate iteration rather than raising, so partial
        results remain usable.

        Args:
            batch_size: Number of items per batch
            limit: Maximum number of items to retrieve (None for no limit)

        Yields:
            list[dict]: Batches of documents as dictionaries
        """
        session = self._create_session()
        total_items = 0

        try:
            # Initialize pagination state.
            offset = (
                self.config.pagination.initial_offset if self.config.pagination else 0
            )
            page = self.config.pagination.initial_page if self.config.pagination else 1
            cursor: str | None = None

            while True:
                # Build request parameters, starting from the static params.
                params = self.config.params.copy()

                if self.config.pagination:
                    if self.config.pagination.strategy == "offset":
                        params[self.config.pagination.offset_param] = offset
                        params[self.config.pagination.limit_param] = (
                            self.config.pagination.page_size
                        )
                    elif self.config.pagination.strategy == "page":
                        params[self.config.pagination.page_param] = page
                        params[self.config.pagination.per_page_param] = (
                            self.config.pagination.page_size
                        )
                    elif self.config.pagination.strategy == "cursor" and cursor:
                        # The very first cursor request carries no cursor.
                        params[self.config.pagination.cursor_param] = cursor

                # Make the request.
                try:
                    response = session.request(
                        method=self.config.method,
                        url=self.config.url,
                        params=params,
                        timeout=self.config.timeout,
                        verify=self.config.verify,
                    )
                    response.raise_for_status()
                    data = response.json()
                except requests.RequestException as e:
                    # Best-effort: stop iterating on failure instead of
                    # propagating, so already-yielded batches stand.
                    logger.error(f"API request failed: {e}")
                    break

                items = self._extract_data(data)

                # Re-batch the page into caller-sized chunks.
                batch = []
                for item in items:
                    # `limit is not None` (not truthiness) so that limit=0
                    # means "no items", matching the documented contract.
                    if limit is not None and total_items >= limit:
                        break
                    batch.append(item)
                    total_items += 1

                    if len(batch) >= batch_size:
                        yield batch
                        batch = []

                # Yield remaining items.
                if batch:
                    yield batch

                # Check if we should continue.
                if limit is not None and total_items >= limit:
                    break

                # Advance pagination state or stop.
                if self.config.pagination:
                    if self.config.pagination.strategy == "offset":
                        if not self._has_more(data):
                            break
                        offset += self.config.pagination.page_size
                    elif self.config.pagination.strategy == "page":
                        if not self._has_more(data):
                            break
                        page += 1
                    elif self.config.pagination.strategy == "cursor":
                        cursor = self._get_next_cursor(data)
                        if not cursor:
                            break
                    else:
                        # Unknown strategy: stop rather than loop forever
                        # re-requesting the same page.
                        logger.error(
                            "Unknown pagination strategy: %s",
                            self.config.pagination.strategy,
                        )
                        break
                else:
                    # No pagination, single request.
                    break

        finally:
            session.close()
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Base classes for data source abstraction.
|
|
2
|
+
|
|
3
|
+
This module defines the abstract base class and types for all data sources.
|
|
4
|
+
Data sources handle data retrieval from various sources (files, APIs, databases)
|
|
5
|
+
and provide a unified interface for batch iteration.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import abc
|
|
11
|
+
from typing import Iterator
|
|
12
|
+
|
|
13
|
+
from graflo.onto import BaseDataclass, BaseEnum
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DataSourceType(BaseEnum):
    """Enumeration of the data-source kinds the system can ingest.

    FILE: file-based sources (JSON, JSONL, CSV/TSV)
    API: REST API sources
    SQL: SQL database sources
    IN_MEMORY: in-memory sources (lists, DataFrames)
    """

    FILE = "file"
    API = "api"
    SQL = "sql"
    IN_MEMORY = "in_memory"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AbstractDataSource(BaseDataclass, abc.ABC):
    """Common interface for every data source.

    A data source is responsible purely for retrieving data and exposing it
    as batches of dictionaries; transformation is handled by Resources,
    which are a separate concern. Several data sources may map to the same
    Resource.

    Attributes:
        source_type: Type of the data source
        resource_name: Name of the resource this data source maps to
            (set externally via DataSourceRegistry)
    """

    source_type: DataSourceType

    def __post_init__(self):
        """Set up internal state once dataclass fields are assigned."""
        # The resource mapping is attached later through the registry.
        self._resource_name: str | None = None

    @property
    def resource_name(self) -> str | None:
        """Name of the resource this source maps to, or None if unset.

        Returns:
            Resource name or None if not set
        """
        return self._resource_name

    @resource_name.setter
    def resource_name(self, value: str | None):
        """Assign the resource this data source maps to.

        Args:
            value: Resource name to set
        """
        self._resource_name = value

    @abc.abstractmethod
    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Yield the source's documents in batches.

        Every batch is a list of dictionaries, each dictionary representing
        one data item.

        Args:
            batch_size: Number of items per batch
            limit: Maximum number of items to retrieve (None for no limit)

        Yields:
            list[dict]: Batches of documents as dictionaries

        Raises:
            NotImplementedError: Must be implemented by subclasses
        """
        raise NotImplementedError("Subclasses must implement iter_batches")

    def __iter__(self):
        """Make the data source iterable over individual items.

        Yields:
            dict: Individual documents
        """
        for batch in self.iter_batches(batch_size=1, limit=None):
            yield from batch
|