graflo-1.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1120 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +297 -0
- graflo/architecture/onto.py +374 -0
- graflo/architecture/resource.py +161 -0
- graflo/architecture/schema.py +136 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +93 -0
- graflo/architecture/vertex.py +586 -0
- graflo/caster.py +655 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +194 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +97 -0
- graflo/data_source/factory.py +298 -0
- graflo/data_source/file.py +133 -0
- graflo/data_source/memory.py +72 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +185 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1026 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +688 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +156 -0
- graflo/db/postgres/conn.py +425 -0
- graflo/db/postgres/resource_mapping.py +139 -0
- graflo/db/postgres/schema_inference.py +245 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2212 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +190 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +556 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +751 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +332 -0
- graflo/util/transform.py +448 -0
- graflo-1.3.3.dist-info/METADATA +190 -0
- graflo-1.3.3.dist-info/RECORD +64 -0
- graflo-1.3.3.dist-info/WHEEL +4 -0
- graflo-1.3.3.dist-info/entry_points.txt +5 -0
- graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
graflo/util/chunker.py
ADDED
@@ -0,0 +1,751 @@
"""Data chunking utilities for efficient file processing.

This module provides utilities for processing large files by breaking them into
manageable chunks. It supports various file formats (JSON, JSONL, CSV) and provides
both file-based and in-memory chunking capabilities.

Key Components:
    - AbstractChunker: Base class for chunking implementations
    - FileChunker: File-based chunking with encoding support
    - TableChunker: CSV/TSV file chunking
    - JsonlChunker: JSON Lines file chunking
    - JsonChunker: JSON file chunking
    - TrivialChunker: In-memory list chunking
    - ChunkerDataFrame: Pandas DataFrame chunking
    - ChunkerFactory: Factory for creating appropriate chunkers

Example:
    >>> chunker = ChunkerFactory.create_chunker(
    ...     resource="data.json",
    ...     type=ChunkerType.JSON,
    ...     batch_size=1000
    ... )
    >>> for batch in chunker:
    ...     process_batch(batch)
"""

import abc
import csv
import gc
import gzip
import json
import logging
import pathlib
import re
from contextlib import contextmanager
from pathlib import Path
from shutil import copyfileobj
from typing import Any, Callable, TextIO, TypeVar
from xml.etree import ElementTree as et

import ijson
import pandas as pd
import xmltodict

from graflo.architecture.onto import BaseEnum, EncodingType

AbstractChunkerType = TypeVar("AbstractChunkerType", bound="AbstractChunker")

logger = logging.getLogger(__name__)


class ChunkerType(BaseEnum):
    """Types of chunkers supported by the system.

    JSON: For JSON files
    JSONL: For JSON Lines files
    TABLE: For CSV/TSV files
    TRIVIAL: For in-memory lists
    """

    JSON = "json"
    JSONL = "jsonl"
    TABLE = "table"
    TRIVIAL = "trivial"


class AbstractChunker(abc.ABC):
    """Abstract base class for chunking implementations.

    This class defines the interface for all chunkers, providing common
    functionality for batch processing and iteration.

    Args:
        batch_size: Number of items per batch (default: 10)
        limit: Maximum number of items to process (default: None)

    Attributes:
        units_processed: Number of items processed
        batch_size: Size of each batch
        limit: Maximum number of items to process
        cnt: Current count of processed items
        iteration_tried: Whether iteration has been attempted
    """

    def __init__(self, batch_size=10, limit=None):
        self.units_processed = 0
        self.batch_size = batch_size
        self.limit: int | None = limit
        self.cnt = 0
        self.iteration_tried = False

    def _limit_reached(self):
        """Check if the processing limit has been reached.

        Returns:
            bool: True if limit is reached, False otherwise
        """
        return self.limit is not None and self.cnt >= self.limit

    def __iter__(self):
        """Initialize iteration if not already done.

        Returns:
            self: Iterator instance
        """
        if not self.iteration_tried:
            self._prepare_iteration()
        return self

    def __next__(self):
        """Get the next batch of items.

        Returns:
            list: Next batch of items

        Raises:
            StopIteration: When no more items are available or limit is reached
        """
        batch = self._next_item()
        self.cnt += len(batch)
        if not batch or self._limit_reached():
            raise StopIteration
        return batch

    @abc.abstractmethod
    def _next_item(self):
        """Get the next item or batch of items.

        This method must be implemented by subclasses.

        Returns:
            Any: Next item or batch of items
        """
        pass

    def _prepare_iteration(self):
        """Prepare for iteration.

        This method is called before the first iteration attempt.
        """
        self.iteration_tried = True


class FileChunker(AbstractChunker):
    """Base class for file-based chunking.

    This class provides functionality for reading and chunking files,
    with support for different encodings and compression.

    Args:
        filename: Path to the file to process
        encoding: File encoding (default: UTF_8)
        mode: File mode ('t' for text, 'b' for binary)
        **kwargs: Additional arguments for AbstractChunker

    Attributes:
        filename: Path to the file
        file_obj: File object for reading
        encoding: File encoding
        mode: File mode
    """

    def __init__(
        self,
        filename,
        encoding: EncodingType = EncodingType.UTF_8,
        mode="t",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.filename: Path = filename
        self.file_obj: TextIO | gzip.GzipFile | None = None
        self.encoding: EncodingType | None = encoding
        self.mode = mode
        if self.mode == "b":
            self.encoding = None

    def _next_item(self):
        """Get the next line from the file.

        Returns:
            str: Next line from the file

        Raises:
            StopIteration: When end of file is reached
            RuntimeError: If file is not opened (should not happen in normal flow)
        """
        # file_obj is guaranteed to be open after _prepare_iteration() is called
        if self.file_obj is None:
            raise RuntimeError("File should be opened before calling _next_item()")
        return next(self.file_obj)

    def _prepare_iteration(self):
        """Open the file for reading.

        Handles both regular and gzipped files.
        """
        super()._prepare_iteration()
        if ".gz" in self.filename.suffixes:
            self.file_obj = gzip.open(
                self.filename.absolute().as_posix(),
                f"r{self.mode}",
                encoding=self.encoding,
            )
        else:
            self.file_obj = open(
                self.filename.absolute().as_posix(),
                f"r{self.mode}",
                encoding=self.encoding,
            )

    def __next__(self):
        """Get the next batch of lines.

        Returns:
            list[str]: Next batch of lines

        Raises:
            StopIteration: When end of file is reached or limit is reached
            RuntimeError: If file is not opened (should not happen in normal flow)
        """
        batch = []

        if self._limit_reached():
            if self.file_obj is not None:
                self.file_obj.close()
            raise StopIteration
        while len(batch) < self.batch_size and not self._limit_reached():
            try:
                batch += [self._next_item()]
                self.cnt += 1
            except StopIteration:
                if batch:
                    return batch
                if self.file_obj is not None:
                    self.file_obj.close()
                raise StopIteration

        return batch


class TableChunker(FileChunker):
    """Chunker for CSV/TSV files.

    This class extends FileChunker to handle tabular data, converting
    each row into a dictionary with column headers as keys.

    Args:
        **kwargs: Arguments for FileChunker, including:
            sep: Field separator (default: ',')
    """

    def __init__(self, **kwargs):
        self.sep = kwargs.pop("sep", ",")
        super().__init__(**kwargs)
        self.header: list[str]

    def _prepare_iteration(self):
        """Read the header row and prepare for iteration."""
        super()._prepare_iteration()
        # After super()._prepare_iteration(), file_obj is guaranteed to be open
        if self.file_obj is None:
            raise RuntimeError("File should be opened by parent _prepare_iteration()")
        header = next(self.file_obj)
        self.header = header.rstrip("\n").split(self.sep)

    def __next__(self):
        """Get the next batch of rows as dictionaries.

        Returns:
            list[dict]: Next batch of rows as dictionaries
        """
        lines = super().__next__()
        lines2 = [
            next(csv.reader([line.rstrip()], skipinitialspace=True)) for line in lines
        ]
        dressed = [dict(zip(self.header, row)) for row in lines2]
        return dressed


class JsonlChunker(FileChunker):
    """Chunker for JSON Lines files.

    This class extends FileChunker to handle JSON Lines format,
    parsing each line as a JSON object.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __next__(self):
        """Get the next batch of JSON objects.

        Returns:
            list[dict]: Next batch of parsed JSON objects
        """
        lines = super().__next__()
        lines2 = [json.loads(line) for line in lines]
        return lines2


class JsonChunker(FileChunker):
    """Chunker for JSON files.

    This class extends FileChunker to handle JSON files using
    streaming JSON parsing for memory efficiency.
    """

    def __init__(self, **kwargs):
        super().__init__(mode="b", **kwargs)
        self.parser: Any

    def _prepare_iteration(self):
        """Initialize the JSON parser for streaming."""
        super()._prepare_iteration()
        # After super()._prepare_iteration(), file_obj is guaranteed to be open
        if self.file_obj is None:
            raise RuntimeError("File should be opened by parent _prepare_iteration()")
        self.parser = ijson.items(self.file_obj, "item")

    def _next_item(self):
        """Get the next JSON object.

        Returns:
            dict: Next parsed JSON object

        Raises:
            StopIteration: When end of file is reached
        """
        return next(self.parser)


class TrivialChunker(AbstractChunker):
    """Chunker for in-memory lists.

    This class provides chunking functionality for lists of dictionaries
    that are already in memory.

    Args:
        array: List of dictionaries to chunk
        **kwargs: Additional arguments for AbstractChunker
    """

    def __init__(self, array: list[dict], **kwargs):
        super().__init__(**kwargs)
        self.array = array

    def _next_item(self):
        """Get the next batch of items from the array.

        Returns:
            list[dict]: Next batch of items
        """
        return self.array[self.cnt : self.cnt + self.batch_size]

    def __next__(self):
        """Get the next batch of items.

        Returns:
            list[dict]: Next batch of items

        Raises:
            StopIteration: When no more items are available or limit is reached
        """
        batch = self._next_item()
        self.cnt += len(batch)
        if not batch or self._limit_reached():
            raise StopIteration
        return batch


class ChunkerDataFrame(AbstractChunker):
    """Chunker for Pandas DataFrames.

    This class provides chunking functionality for Pandas DataFrames,
    converting each chunk into a list of dictionaries.

    Args:
        df: DataFrame to chunk
        **kwargs: Additional arguments for AbstractChunker
    """

    def __init__(self, df: pd.DataFrame, **kwargs):
        super().__init__(**kwargs)
        self.df = df
        self.columns = df.columns

    def _next_item(self):
        """Get the next batch of rows as dictionaries.

        Returns:
            list[dict]: Next batch of rows as dictionaries
        """
        cid = self.cnt
        pre_batch = self.df.iloc[cid : cid + self.batch_size].values.tolist()
        batch = [{k: v for k, v in zip(self.columns, item)} for item in pre_batch]
        return batch


class ChunkerFactory:
    """Factory for creating appropriate chunkers.

    This class provides a factory method for creating chunkers based on
    the type of resource and configuration provided.

    Example:
        >>> chunker = ChunkerFactory.create_chunker(
        ...     resource="data.json",
        ...     type=ChunkerType.JSON,
        ...     batch_size=1000
        ... )
    """

    @classmethod
    def _guess_chunker_type(cls, filename: Path) -> ChunkerType:
        """Guess the appropriate chunker type based on file extension.

        This method examines the file extension to determine the most appropriate
        chunker type. It supports common file extensions for JSON, JSONL, and CSV/TSV files,
        including compressed versions (e.g., .json.gz, .csv.gz).

        Args:
            filename: Path to the file to analyze

        Returns:
            ChunkerType: Guessed chunker type based on file extension

        Raises:
            ValueError: If file extension is not recognized
        """
        # Get all suffixes and remove compression extensions
        suffixes = filename.suffixes
        base_suffix = [y for y in suffixes if y.lower() not in (".gz", ".zip")][
            -1
        ].lower()

        if base_suffix == ".json":
            return ChunkerType.JSON
        elif base_suffix == ".jsonl":
            return ChunkerType.JSONL
        elif base_suffix in (".csv", ".tsv", ".txt"):
            return ChunkerType.TABLE
        else:
            raise ValueError(
                f"Could not guess chunker type for file extension: {base_suffix}"
            )

    @classmethod
    def create_chunker(cls, **kwargs) -> AbstractChunker:
        """Create an appropriate chunker for the given resource.

        Args:
            **kwargs: Configuration for the chunker, including:
                resource: Path to file, list, or DataFrame
                type: Type of chunker to create (optional, will be guessed if None)
                batch_size: Size of each batch
                limit: Maximum number of items to process

        Returns:
            AbstractChunker: Appropriate chunker instance

        Raises:
            ValueError: If resource type is not supported or chunker type cannot be guessed
        """
        resource: Path | list[dict] | pd.DataFrame | None = kwargs.pop("resource", None)
        chunker_type = kwargs.pop("type", None)

        if isinstance(resource, list):
            return TrivialChunker(array=resource, **kwargs)
        elif isinstance(resource, pd.DataFrame):
            return ChunkerDataFrame(df=resource, **kwargs)
        elif isinstance(resource, Path):
            if chunker_type is None:
                chunker_type = cls._guess_chunker_type(resource)
            if chunker_type == ChunkerType.JSON:
                return JsonChunker(filename=resource, **kwargs)
            elif chunker_type == ChunkerType.JSONL:
                return JsonlChunker(filename=resource, **kwargs)
            elif chunker_type == ChunkerType.TABLE:
                return TableChunker(filename=resource, **kwargs)
            else:
                raise ValueError(f"Unknown chunker type: {chunker_type}")
        else:
            raise ValueError(f"Unsupported resource type: {type(resource)}")


class ChunkFlusherMono:
    """Monolithic chunk flusher for writing data to files.

    This class provides functionality for writing chunks of data to files,
    with support for file naming and size limits.

    Args:
        target_prefix: Prefix for output files
        chunksize: Maximum number of items per file
        maxchunks: Maximum number of chunks to write
        suffix: File suffix (default: '.json')
    """

    def __init__(self, target_prefix, chunksize, maxchunks=None, suffix=None):
        self.target_prefix = target_prefix
        self.acc = []
        self.chunk_count = 0
        self.chunksize = chunksize
        self.maxchunks = maxchunks
        self.iprocessed = 0
        self.suffix = "good" if suffix is None else suffix
        logger.info(f" in flush_chunk {self.chunksize}")

    def flush_chunk(self):
        """Write the current chunk to a file."""
        logger.info(
            f" in flush_chunk: : {len(self.acc)}; chunk count : {self.chunk_count}"
        )
        if len(self.acc) > 0:
            filename = f"{self.target_prefix}#{self.suffix}#{self.chunk_count}.json.gz"
            with gzip.GzipFile(filename, "w") as fout:
                fout.write(json.dumps(self.acc, indent=4).encode("utf-8"))
            logger.info(f" flushed {filename}")
            self.chunk_count += 1
            self.iprocessed += len(self.acc)
            self.acc = []

    def push(self, item):
        """Add an item to the current chunk.

        Args:
            item: Item to add to the chunk
        """
        self.acc.append(item)
        if len(self.acc) >= self.chunksize:
            self.flush_chunk()
            gc.collect()

    def stop(self):
        """Flush any remaining items and close."""
        return self.maxchunks is not None and (self.chunk_count >= self.maxchunks)

    def items_processed(self):
        """Get the total number of items processed.

        Returns:
            int: Number of items processed
        """
        return self.iprocessed


class FPSmart:
    """Smart file pointer for pattern-based file processing.

    This class provides a file-like interface with pattern-based
    transformation of the data being read.

    Args:
        fp: File pointer to wrap
        pattern: Regular expression pattern to match
        substitute: String to substitute for matches
        count: Maximum number of substitutions (0 for unlimited)
    """

    def __init__(self, fp, pattern, substitute="", count=0):
        self.fp = fp
        self.pattern = pattern
        self.p = re.compile(self.pattern)
        self.count = count
        self.sub = substitute

    def read(self, n):
        """Read and transform data from the file.

        Args:
            n: Number of bytes to read

        Returns:
            str: Transformed data
        """
        s = self.fp.read(n).decode()
        return self.transform(s).encode()

    def transform(self, s):
        """Transform the data using the pattern.

        Args:
            s: Data to transform

        Returns:
            str: Transformed data
        """
        self.p.search(s)
        r = self.p.sub(self.sub, s, count=self.count)
        return r

    def close(self):
        """Close the underlying file pointer."""
        self.fp.close()


tag_wos = "REC"
pattern_wos = r"xmlns=\".*[^\"]\"(?=>)"
force_list_wos = (
    "abstract",
    "address_name",
    "book_note",
    "conf_date",
    "conf_info",
    "conf_location",
    "conf_title",
    "conference",
    "contributor",
    "doctype",
    "grant",
    "grant_id",
    "heading",
    "identifier",
    "keyword",
    "language",
    "name",
    "organization",
    "p",
    "publisher",
    "reference",
    "rw_author",
    "sponsor",
    "subheading",
    "subject",
    "suborganization",
    "title",
    "edition",
    "zip",
)


@contextmanager
def nullcontext(enter_result=None):
    """Context manager that does nothing.

    Args:
        enter_result: Value to return when entering the context

    Yields:
        The enter_result value
    """
    yield enter_result


def gunzip_file(fname_in, fname_out):
    """Decompress a gzipped file.

    Args:
        fname_in: Path to input gzipped file
        fname_out: Path to output decompressed file
    """
    with gzip.open(fname_in, "rb") as f_in:
        with open(fname_out, "wb") as f_out:
            copyfileobj(f_in, f_out)


def parse_simple(fp, good_cf, force_list=None, root_tag=None):
    """Parse XML file with simple structure.

    Args:
        fp: File pointer to parse
        good_cf: Function to check if an element is valid
        force_list: List of tags that should always be lists
        root_tag: Root tag to start parsing from

    Returns:
        dict: Parsed XML data
    """
    events = ("start", "end")
    tree = et.iterparse(fp, events)
    context = iter(tree)
    event, root = next(context)
    for event, pub in context:
        if event == "end" and (pub.tag == root_tag if root_tag is not None else True):
            item = et.tostring(pub, encoding="utf8", method="xml").decode("utf")
            obj = xmltodict.parse(
                item,
                force_cdata=True,
                force_list=force_list,
            )
            good_cf.push(obj)
            root.clear()
            if good_cf.stop():
                break


def convert(
    source: pathlib.Path,
    target_root: str,
    chunk_size: int = 10000,
    max_chunks=None,
    pattern: str | None = None,
    force_list=None,
    root_tag=None,
):
    """Convert XML file to JSON chunks.

    This function processes an XML file and converts it to a series of JSON files,
    with support for pattern-based transformation and chunking.

    Args:
        source: Path to source XML file
        target_root: Root path for output files
        chunk_size: Number of items per output file (default: 10000)
        max_chunks: Maximum number of chunks to create (default: None)
        pattern: Regular expression pattern for transformation
        force_list: List of tags that should always be lists
        root_tag: Root tag to start parsing from

    Example:
        >>> convert(
        ...     source="data.xml",
        ...     target_root="output",
        ...     chunk_size=1000,
        ...     pattern=r'xmlns="[^"]*"',
        ...     root_tag="PubmedArticle"
        ... )
    """
    logger.info(f" chunksize : {chunk_size} | maxchunks {max_chunks} ")

    good_cf = ChunkFlusherMono(target_root, chunk_size, max_chunks)
    bad_cf = ChunkFlusherMono(target_root, chunk_size, max_chunks, suffix="bad")

    if source.suffix == ".gz":
        open_foo: Callable = gzip.open
    elif source.suffix == ".xml":
        open_foo = open
    else:
        raise ValueError("Unknown file type")
    # pylint: disable-next=assignment
    fp: gzip.GzipFile | FPSmart | None

    with (
        open_foo(source, "rb")
        if isinstance(  # type: ignore
            source, pathlib.Path
        )
        else nullcontext() as fp
    ):
        if pattern is not None:
            fp = FPSmart(fp, pattern)
        else:
            fp = fp
        parse_simple(fp, good_cf, force_list, root_tag)

    good_cf.flush_chunk()

    logger.info(f" {good_cf.items_processed()} good records")
    bad_cf.flush_chunk()
    logger.info(f"{bad_cf.items_processed()} bad records")
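For orientation, here is a minimal usage sketch for the chunking API added in this file; it is an illustrative example, not part of the package contents above. It assumes the wheel is installed and that a local events.csv file exists (both hypothetical). Note that create_chunker() dispatches on the concrete type of resource, so a pathlib.Path (rather than a plain string) selects the file-based chunkers.

from pathlib import Path

from graflo.util.chunker import ChunkerFactory, ChunkerType

# A Path resource selects a file-based chunker; `type` is optional and is
# guessed from the file suffix (.json, .jsonl, .csv/.tsv/.txt) when omitted.
chunker = ChunkerFactory.create_chunker(
    resource=Path("events.csv"),  # hypothetical input file
    type=ChunkerType.TABLE,
    batch_size=500,
)

for batch in chunker:
    # TableChunker yields lists of dicts keyed by the CSV header row.
    print(len(batch), batch[0])

Passing a list of dicts or a pandas DataFrame as resource instead selects TrivialChunker or ChunkerDataFrame, respectively, with the same iteration interface.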