graflo-1.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1120 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +297 -0
- graflo/architecture/onto.py +374 -0
- graflo/architecture/resource.py +161 -0
- graflo/architecture/schema.py +136 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +93 -0
- graflo/architecture/vertex.py +586 -0
- graflo/caster.py +655 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +194 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +97 -0
- graflo/data_source/factory.py +298 -0
- graflo/data_source/file.py +133 -0
- graflo/data_source/memory.py +72 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +185 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1026 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +688 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +156 -0
- graflo/db/postgres/conn.py +425 -0
- graflo/db/postgres/resource_mapping.py +139 -0
- graflo/db/postgres/schema_inference.py +245 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2212 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +190 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +556 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +751 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +332 -0
- graflo/util/transform.py +448 -0
- graflo-1.3.3.dist-info/METADATA +190 -0
- graflo-1.3.3.dist-info/RECORD +64 -0
- graflo-1.3.3.dist-info/WHEEL +4 -0
- graflo-1.3.3.dist-info/entry_points.txt +5 -0
- graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
graflo/data_source/factory.py
@@ -0,0 +1,298 @@
+"""Factory for creating data source instances.
+
+This module provides a factory for creating appropriate data source instances
+based on configuration. It supports file-based, API, and SQL data sources.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+from graflo.architecture.onto import EncodingType
+from graflo.data_source.api import APIConfig, APIDataSource
+from graflo.data_source.base import AbstractDataSource, DataSourceType
+from graflo.data_source.file import (
+    JsonFileDataSource,
+    JsonlFileDataSource,
+    TableFileDataSource,
+)
+from graflo.data_source.memory import InMemoryDataSource
+from graflo.data_source.sql import SQLConfig, SQLDataSource
+from graflo.util.chunker import ChunkerFactory, ChunkerType
+
+logger = logging.getLogger(__name__)
+
+
+class DataSourceFactory:
+    """Factory for creating data source instances.
+
+    This factory creates appropriate data source instances based on the
+    provided configuration. It supports file-based, API, and SQL data sources.
+    """
+
+    @staticmethod
+    def _guess_file_type(filename: Path) -> ChunkerType:
+        """Guess the file type based on file extension.
+
+        Args:
+            filename: Path to the file
+
+        Returns:
+            ChunkerType: Guessed file type
+
+        Raises:
+            ValueError: If file extension is not recognized
+        """
+        return ChunkerFactory._guess_chunker_type(filename)
+
+    @classmethod
+    def create_file_data_source(
+        cls,
+        path: Path | str,
+        file_type: str | ChunkerType | None = None,
+        encoding: EncodingType = EncodingType.UTF_8,
+        sep: str | None = None,
+    ) -> JsonFileDataSource | JsonlFileDataSource | TableFileDataSource:
+        """Create a file-based data source.
+
+        Args:
+            path: Path to the file
+            file_type: Type of file ('json', 'jsonl', 'table') or ChunkerType.
+                If None, will be guessed from file extension.
+            encoding: File encoding (default: UTF_8)
+            sep: Field separator for table files (default: ',').
+                Only used for table files.
+
+        Returns:
+            Appropriate file data source instance (JsonFileDataSource,
+            JsonlFileDataSource, or TableFileDataSource)
+
+        Raises:
+            ValueError: If file type cannot be determined
+        """
+        if isinstance(path, str):
+            path = Path(path)
+
+        # Determine file type
+        if file_type is None:
+            try:
+                file_type_enum = cls._guess_file_type(path)
+            except ValueError as e:
+                raise ValueError(
+                    f"Could not determine file type for {path}. "
+                    f"Please specify file_type explicitly. Error: {e}"
+                )
+        elif isinstance(file_type, str):
+            file_type_enum = ChunkerType(file_type.lower())
+        else:
+            file_type_enum = file_type
+
+        # Create appropriate data source
+        if file_type_enum == ChunkerType.JSON:
+            return JsonFileDataSource(path=path, encoding=encoding)
+        elif file_type_enum == ChunkerType.JSONL:
+            return JsonlFileDataSource(path=path, encoding=encoding)
+        elif file_type_enum == ChunkerType.TABLE:
+            # sep is only for table files
+            return TableFileDataSource(path=path, encoding=encoding, sep=sep or ",")
+        else:
+            raise ValueError(f"Unsupported file type: {file_type_enum}")
+
+    @classmethod
+    def create_api_data_source(cls, config: APIConfig) -> APIDataSource:
+        """Create an API data source.
+
+        Args:
+            config: API configuration
+
+        Returns:
+            APIDataSource instance
+        """
+        return APIDataSource(config=config)
+
+    @classmethod
+    def create_sql_data_source(cls, config: SQLConfig) -> SQLDataSource:
+        """Create a SQL data source.
+
+        Args:
+            config: SQL configuration
+
+        Returns:
+            SQLDataSource instance
+        """
+        return SQLDataSource(config=config)
+
+    @classmethod
+    def create_in_memory_data_source(
+        cls,
+        data: list[dict] | list[list] | pd.DataFrame,
+        columns: list[str] | None = None,
+    ) -> InMemoryDataSource:
+        """Create an in-memory data source.
+
+        Args:
+            data: Data to process (list[dict], list[list], or pd.DataFrame)
+            columns: Optional column names for list[list] data
+
+        Returns:
+            InMemoryDataSource instance
+        """
+        return InMemoryDataSource(data=data, columns=columns)
+
+    @classmethod
+    def create_data_source(
+        cls,
+        source_type: DataSourceType | str | None = None,
+        **kwargs: Any,
+    ) -> AbstractDataSource:
+        """Create a data source of the specified type.
+
+        This is a general factory method that routes to specific factory methods
+        based on the source type.
+
+        Args:
+            source_type: Type of data source to create. If None, will be inferred
+                from kwargs (e.g., 'path' -> FILE, 'data' -> IN_MEMORY, 'config' with url -> API)
+            **kwargs: Configuration parameters for the data source
+
+        Returns:
+            Data source instance
+
+        Raises:
+            ValueError: If source type is not supported or required parameters are missing
+        """
+        # Auto-detect source type if not provided
+        if source_type is None:
+            if "path" in kwargs or "file_type" in kwargs:
+                source_type = DataSourceType.FILE
+            elif "data" in kwargs:
+                source_type = DataSourceType.IN_MEMORY
+            elif "config" in kwargs:
+                config = kwargs["config"]
+                # Check if it's an API config (has 'url') or SQL config (has 'connection_string')
+                if isinstance(config, dict):
+                    if "url" in config:
+                        source_type = DataSourceType.API
+                    elif "connection_string" in config or "query" in config:
+                        source_type = DataSourceType.SQL
+                    else:
+                        # Try to create from dict
+                        if "source_type" in config:
+                            source_type = DataSourceType(config["source_type"].lower())
+                        else:
+                            raise ValueError(
+                                "Cannot determine source type from config. "
+                                "Please specify source_type or provide 'url' (API) "
+                                "or 'connection_string'/'query' (SQL) in config."
+                            )
+                elif hasattr(config, "url"):
+                    source_type = DataSourceType.API
+                elif hasattr(config, "connection_string") or hasattr(config, "query"):
+                    source_type = DataSourceType.SQL
+                else:
+                    raise ValueError(
+                        "Cannot determine source type from config. "
+                        "Please specify source_type explicitly."
+                    )
+            else:
+                raise ValueError(
+                    "Cannot determine source type. Please specify source_type or "
+                    "provide one of: path (FILE), data (IN_MEMORY), or config (API/SQL)."
+                )
+
+        if isinstance(source_type, str):
+            source_type = DataSourceType(source_type.lower())
+
+        if source_type == DataSourceType.FILE:
+            return cls.create_file_data_source(**kwargs)
+        elif source_type == DataSourceType.API:
+            if "config" not in kwargs:
+                # Create APIConfig from kwargs
+                from graflo.data_source.api import APIConfig, PaginationConfig
+
+                # Handle nested pagination config manually
+                api_kwargs = kwargs.copy()
+                pagination_dict = api_kwargs.pop("pagination", None)
+                pagination = None
+                if pagination_dict is not None:
+                    if isinstance(pagination_dict, dict):
+                        # Manually construct PaginationConfig to avoid dataclass_wizard issues
+                        pagination = PaginationConfig(**pagination_dict)
+                    else:
+                        pagination = pagination_dict
+                api_kwargs["pagination"] = pagination
+                config = APIConfig(**api_kwargs)
+                return cls.create_api_data_source(config=config)
+            config = kwargs["config"]
+            if isinstance(config, dict):
+                from graflo.data_source.api import APIConfig, PaginationConfig
+
+                # Handle nested pagination config manually
+                config_copy = config.copy()
+                pagination_dict = config_copy.pop("pagination", None)
+                pagination = None
+                if pagination_dict is not None:
+                    if isinstance(pagination_dict, dict):
+                        # Manually construct PaginationConfig to avoid dataclass_wizard issues
+                        pagination = PaginationConfig(**pagination_dict)
+                    else:
+                        pagination = pagination_dict
+                config_copy["pagination"] = pagination
+                config = APIConfig(**config_copy)
+            return cls.create_api_data_source(config=config)
+        elif source_type == DataSourceType.SQL:
+            if "config" not in kwargs:
+                # Create SQLConfig from kwargs
+                from graflo.data_source.sql import SQLConfig
+
+                config = SQLConfig.from_dict(kwargs)
+                return cls.create_sql_data_source(config=config)
+            config = kwargs["config"]
+            if isinstance(config, dict):
+                from graflo.data_source.sql import SQLConfig
+
+                config = SQLConfig.from_dict(config)
+            return cls.create_sql_data_source(config=config)
+        elif source_type == DataSourceType.IN_MEMORY:
+            if "data" not in kwargs:
+                raise ValueError("In-memory data source requires 'data' parameter")
+            return cls.create_in_memory_data_source(**kwargs)
+        else:
+            raise ValueError(f"Unsupported data source type: {source_type}")
+
+    @classmethod
+    def create_data_source_from_config(
+        cls, config: dict[str, Any]
+    ) -> AbstractDataSource:
+        """Create a data source from a configuration dictionary.
+
+        The configuration dict should contain:
+        - 'source_type': Type of data source (FILE, API, SQL, IN_MEMORY)
+        - Other parameters specific to the data source type
+
+        Examples:
+            File source:
+                {"source_type": "file", "path": "data.json"}
+            API source:
+                {"source_type": "api", "config": {"url": "https://api.example.com"}}
+            SQL source:
+                {"source_type": "sql", "config": {"connection_string": "...", "query": "..."}}
+            In-memory source:
+                {"source_type": "in_memory", "data": [...]}
+
+        Args:
+            config: Configuration dictionary
+
+        Returns:
+            Data source instance
+
+        Raises:
+            ValueError: If configuration is invalid
+        """
+        config = config.copy()
+        source_type = config.pop("source_type", None)
+        return cls.create_data_source(source_type=source_type, **config)
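The factory is easiest to read from its entry points: create_data_source infers the source type from its keyword arguments, while create_data_source_from_config drives everything from a plain dict. A minimal usage sketch follows; the file paths, URL, connection string, and query are placeholder values, not shipped with the package, and the config keys mirror the docstring examples above.

from graflo.data_source.factory import DataSourceFactory

# FILE: type inferred from the .jsonl extension via _guess_file_type
jsonl_source = DataSourceFactory.create_data_source(path="events.jsonl")

# FILE: explicit type and separator for a tab-separated table
tsv_source = DataSourceFactory.create_file_data_source(
    path="events.tsv", file_type="table", sep="\t"
)

# API: a dict config containing 'url' routes to DataSourceType.API
api_source = DataSourceFactory.create_data_source(
    config={"url": "https://api.example.com/items"}
)

# SQL: 'connection_string'/'query' route to DataSourceType.SQL
sql_source = DataSourceFactory.create_data_source_from_config(
    {
        "source_type": "sql",
        "config": {
            "connection_string": "postgresql://localhost/example",
            "query": "SELECT * FROM items",
        },
    }
)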
graflo/data_source/file.py
@@ -0,0 +1,133 @@
+"""File-based data source implementations.
+
+This module provides data source implementations for file-based data sources,
+including JSON, JSONL, and CSV/TSV files. It integrates with the existing
+chunker logic for efficient batch processing.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+from pathlib import Path
+from typing import Iterator
+
+from graflo.architecture.onto import EncodingType
+from graflo.data_source.base import AbstractDataSource, DataSourceType
+from graflo.util.chunker import ChunkerFactory, ChunkerType
+
+
+@dataclasses.dataclass
+class FileDataSource(AbstractDataSource):
+    """Base class for file-based data sources.
+
+    This class provides a common interface for file-based data sources,
+    integrating with the existing chunker system for batch processing.
+
+    Attributes:
+        path: Path to the file
+        file_type: Type of file (json, jsonl, table)
+        encoding: File encoding (default: UTF_8)
+    """
+
+    path: Path | str
+    file_type: str | None = None
+    encoding: EncodingType = EncodingType.UTF_8
+
+    def __post_init__(self):
+        """Initialize the file data source."""
+        self.source_type = DataSourceType.FILE
+        if isinstance(self.path, str):
+            self.path = Path(self.path)
+
+    def iter_batches(
+        self, batch_size: int = 1000, limit: int | None = None
+    ) -> Iterator[list[dict]]:
+        """Iterate over file data in batches.
+
+        Args:
+            batch_size: Number of items per batch
+            limit: Maximum number of items to retrieve
+
+        Yields:
+            list[dict]: Batches of documents as dictionaries
+        """
+        # Determine chunker type
+        chunker_type = None
+        if self.file_type:
+            chunker_type = ChunkerType(self.file_type.lower())
+
+        # Create chunker using factory
+        chunker_kwargs = {
+            "resource": self.path,
+            "type": chunker_type,
+            "batch_size": batch_size,
+            "limit": limit,
+            "encoding": self.encoding,
+        }
+        # Only add sep for table files
+        if chunker_type == ChunkerType.TABLE and hasattr(self, "sep"):
+            chunker_kwargs["sep"] = self.sep
+
+        chunker = ChunkerFactory.create_chunker(**chunker_kwargs)
+
+        # Yield batches
+        for batch in chunker:
+            yield batch
+
+
+@dataclasses.dataclass
+class JsonFileDataSource(FileDataSource):
+    """Data source for JSON files.
+
+    JSON files are expected to contain hierarchical data structures,
+    similar to REST API responses. The chunker handles nested structures
+    and converts them to dictionaries.
+
+    Attributes:
+        path: Path to the JSON file
+        encoding: File encoding (default: UTF_8)
+    """
+
+    def __post_init__(self):
+        """Initialize the JSON file data source."""
+        super().__post_init__()
+        self.file_type = ChunkerType.JSON.value
+
+
+@dataclasses.dataclass
+class JsonlFileDataSource(FileDataSource):
+    """Data source for JSONL (JSON Lines) files.
+
+    JSONL files contain one JSON object per line, making them suitable
+    for streaming and batch processing.
+
+    Attributes:
+        path: Path to the JSONL file
+        encoding: File encoding (default: UTF_8)
+    """
+
+    def __post_init__(self):
+        """Initialize the JSONL file data source."""
+        super().__post_init__()
+        self.file_type = ChunkerType.JSONL.value
+
+
+@dataclasses.dataclass
+class TableFileDataSource(FileDataSource):
+    """Data source for CSV/TSV files.
+
+    Table files are converted to dictionaries with column headers as keys.
+    Each row becomes a dictionary.
+
+    Attributes:
+        path: Path to the CSV/TSV file
+        encoding: File encoding (default: UTF_8)
+        sep: Field separator (default: ',')
+    """
+
+    sep: str = ","
+
+    def __post_init__(self):
+        """Initialize the table file data source."""
+        super().__post_init__()
+        self.file_type = ChunkerType.TABLE.value
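All three concrete classes share FileDataSource.iter_batches, so consuming a file looks the same regardless of format. A minimal sketch, assuming a hypothetical users.csv with id and name columns:

from graflo.data_source.file import TableFileDataSource

source = TableFileDataSource(path="users.csv", sep=",")
# The chunker materializes at most one batch at a time; limit caps the total rows
for batch in source.iter_batches(batch_size=500, limit=2000):
    for row in batch:
        ...  # row is a dict keyed by the CSV column headers

Each subclass pins file_type in __post_init__, so the chunker type is resolved before ChunkerFactory.create_chunker is called.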
graflo/data_source/memory.py
@@ -0,0 +1,72 @@
+"""In-memory data source implementations.
+
+This module provides data source implementations for in-memory data structures,
+including lists of dictionaries, lists of lists, and Pandas DataFrames.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+from typing import Iterator
+
+import pandas as pd
+
+from graflo.data_source.base import AbstractDataSource, DataSourceType
+from graflo.util.chunker import ChunkerFactory
+
+
+@dataclasses.dataclass
+class InMemoryDataSource(AbstractDataSource):
+    """Data source for in-memory data structures.
+
+    This class provides a data source for Python objects that are already
+    in memory, including lists of dictionaries, lists of lists, and Pandas DataFrames.
+
+    Attributes:
+        data: Data to process (list[dict], list[list], or pd.DataFrame)
+        columns: Optional column names for list[list] data
+    """
+
+    data: list[dict] | list[list] | pd.DataFrame
+    columns: list[str] | None = None
+
+    def __post_init__(self):
+        """Initialize the in-memory data source."""
+        self.source_type = DataSourceType.IN_MEMORY
+
+    def iter_batches(
+        self, batch_size: int = 1000, limit: int | None = None
+    ) -> Iterator[list[dict]]:
+        """Iterate over in-memory data in batches.
+
+        Args:
+            batch_size: Number of items per batch
+            limit: Maximum number of items to retrieve
+
+        Yields:
+            list[dict]: Batches of documents as dictionaries
+        """
+        # Normalize data: convert list[list] to list[dict] if needed
+        data = self.data
+        if isinstance(data, list) and len(data) > 0 and isinstance(data[0], list):
+            # list[list] - convert to list[dict] using columns
+            if self.columns is None:
+                raise ValueError(
+                    "columns parameter is required when data is list[list]"
+                )
+            data = [{k: v for k, v in zip(self.columns, item)} for item in data]
+
+        # Create chunker using factory (only pass columns if it's a DataFrame)
+        chunker_kwargs = {
+            "resource": data,
+            "batch_size": batch_size,
+            "limit": limit,
+        }
+        # Note: columns is not passed to chunker - we handle list[list] conversion above
+        # DataFrame chunker doesn't need columns either
+
+        chunker = ChunkerFactory.create_chunker(**chunker_kwargs)
+
+        # Yield batches
+        for batch in chunker:
+            yield batch
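A short sketch of the list[list] normalization performed in iter_batches above:

from graflo.data_source.memory import InMemoryDataSource

rows = [[1, "alice"], [2, "bob"]]
source = InMemoryDataSource(data=rows, columns=["id", "name"])
for batch in source.iter_batches(batch_size=1000):
    print(batch)  # [{'id': 1, 'name': 'alice'}, {'id': 2, 'name': 'bob'}]

Omitting columns for list[list] input raises the ValueError shown above; list[dict] and pd.DataFrame inputs are handed to the chunker unchanged.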
graflo/data_source/registry.py
@@ -0,0 +1,82 @@
+"""Data source registry for mapping data sources to resources.
+
+This module provides a registry for mapping data sources to resource names.
+Many data sources can map to the same resource, allowing flexible data
+ingestion from multiple sources.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+from typing import TYPE_CHECKING
+
+from graflo.onto import BaseDataclass
+
+if TYPE_CHECKING:
+    from graflo.data_source.base import AbstractDataSource
+
+
+@dataclasses.dataclass
+class DataSourceRegistry(BaseDataclass):
+    """Registry for mapping data sources to resource names.
+
+    This class maintains a mapping from resource names to lists of data sources.
+    Many data sources can map to the same resource, allowing data to be ingested
+    from multiple sources and combined.
+
+    Attributes:
+        sources: Dictionary mapping resource names to lists of data sources
+    """
+
+    sources: dict[str, list[AbstractDataSource]] = dataclasses.field(
+        default_factory=dict
+    )
+
+    def register(self, data_source: AbstractDataSource, resource_name: str) -> None:
+        """Register a data source for a resource.
+
+        Args:
+            data_source: Data source to register
+            resource_name: Name of the resource to map to
+        """
+        if resource_name not in self.sources:
+            self.sources[resource_name] = []
+        self.sources[resource_name].append(data_source)
+        data_source.resource_name = resource_name
+
+    def get_data_sources(self, resource_name: str) -> list[AbstractDataSource]:
+        """Get all data sources for a resource.
+
+        Args:
+            resource_name: Name of the resource
+
+        Returns:
+            List of data sources for the resource (empty list if none found)
+        """
+        return self.sources.get(resource_name, [])
+
+    def get_all_data_sources(self) -> list[AbstractDataSource]:
+        """Get all registered data sources.
+
+        Returns:
+            List of all registered data sources
+        """
+        all_sources = []
+        for sources_list in self.sources.values():
+            all_sources.extend(sources_list)
+        return all_sources
+
+    def has_resource(self, resource_name: str) -> bool:
+        """Check if a resource has any data sources.
+
+        Args:
+            resource_name: Name of the resource
+
+        Returns:
+            True if the resource has data sources, False otherwise
+        """
+        return resource_name in self.sources and len(self.sources[resource_name]) > 0
+
+    def clear(self) -> None:
+        """Clear all registered data sources."""
+        self.sources.clear()
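Taken together with the factory, the registry lets several sources feed one logical resource. A sketch, assuming the resource name and inputs are illustrative and that BaseDataclass adds no required constructor arguments:

from graflo.data_source.factory import DataSourceFactory
from graflo.data_source.registry import DataSourceRegistry

registry = DataSourceRegistry()

# Two independent sources mapped to the same "users" resource
registry.register(DataSourceFactory.create_data_source(path="users.jsonl"), "users")
registry.register(
    DataSourceFactory.create_in_memory_data_source(data=[{"id": 3, "name": "carol"}]),
    "users",
)

assert registry.has_resource("users")
for source in registry.get_data_sources("users"):
    for batch in source.iter_batches(batch_size=1000):
        ...  # ingest each batch

registry.clear()

Note that register also stamps resource_name onto each data source, so a source always knows which resource it feeds.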