graflo-1.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1120 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +297 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +586 -0
  13. graflo/caster.py +655 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +194 -0
  16. graflo/cli/manage_dbs.py +197 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/data_source/__init__.py +48 -0
  20. graflo/data_source/api.py +339 -0
  21. graflo/data_source/base.py +97 -0
  22. graflo/data_source/factory.py +298 -0
  23. graflo/data_source/file.py +133 -0
  24. graflo/data_source/memory.py +72 -0
  25. graflo/data_source/registry.py +82 -0
  26. graflo/data_source/sql.py +185 -0
  27. graflo/db/__init__.py +44 -0
  28. graflo/db/arango/__init__.py +22 -0
  29. graflo/db/arango/conn.py +1026 -0
  30. graflo/db/arango/query.py +180 -0
  31. graflo/db/arango/util.py +88 -0
  32. graflo/db/conn.py +377 -0
  33. graflo/db/connection/__init__.py +6 -0
  34. graflo/db/connection/config_mapping.py +18 -0
  35. graflo/db/connection/onto.py +688 -0
  36. graflo/db/connection/wsgi.py +29 -0
  37. graflo/db/manager.py +119 -0
  38. graflo/db/neo4j/__init__.py +16 -0
  39. graflo/db/neo4j/conn.py +639 -0
  40. graflo/db/postgres/__init__.py +156 -0
  41. graflo/db/postgres/conn.py +425 -0
  42. graflo/db/postgres/resource_mapping.py +139 -0
  43. graflo/db/postgres/schema_inference.py +245 -0
  44. graflo/db/postgres/types.py +148 -0
  45. graflo/db/tigergraph/__init__.py +9 -0
  46. graflo/db/tigergraph/conn.py +2212 -0
  47. graflo/db/util.py +49 -0
  48. graflo/filter/__init__.py +21 -0
  49. graflo/filter/onto.py +525 -0
  50. graflo/logging.conf +22 -0
  51. graflo/onto.py +190 -0
  52. graflo/plot/__init__.py +17 -0
  53. graflo/plot/plotter.py +556 -0
  54. graflo/util/__init__.py +23 -0
  55. graflo/util/chunker.py +751 -0
  56. graflo/util/merge.py +150 -0
  57. graflo/util/misc.py +37 -0
  58. graflo/util/onto.py +332 -0
  59. graflo/util/transform.py +448 -0
  60. graflo-1.3.3.dist-info/METADATA +190 -0
  61. graflo-1.3.3.dist-info/RECORD +64 -0
  62. graflo-1.3.3.dist-info/WHEEL +4 -0
  63. graflo-1.3.3.dist-info/entry_points.txt +5 -0
  64. graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
graflo/util/chunker.py ADDED
@@ -0,0 +1,751 @@
"""Data chunking utilities for efficient file processing.

This module provides utilities for processing large files by breaking them into
manageable chunks. It supports various file formats (JSON, JSONL, CSV) and provides
both file-based and in-memory chunking capabilities.

Key Components:
    - AbstractChunker: Base class for chunking implementations
    - FileChunker: File-based chunking with encoding support
    - TableChunker: CSV/TSV file chunking
    - JsonlChunker: JSON Lines file chunking
    - JsonChunker: JSON file chunking
    - TrivialChunker: In-memory list chunking
    - ChunkerDataFrame: Pandas DataFrame chunking
    - ChunkerFactory: Factory for creating appropriate chunkers

Example:
    >>> chunker = ChunkerFactory.create_chunker(
    ...     resource=Path("data.json"),
    ...     type=ChunkerType.JSON,
    ...     batch_size=1000
    ... )
    >>> for batch in chunker:
    ...     process_batch(batch)
"""

import abc
import csv
import gc
import gzip
import json
import logging
import pathlib
import re
from contextlib import contextmanager
from pathlib import Path
from shutil import copyfileobj
from typing import Any, Callable, TextIO, TypeVar
from xml.etree import ElementTree as et

import ijson
import pandas as pd
import xmltodict

from graflo.architecture.onto import BaseEnum, EncodingType

AbstractChunkerType = TypeVar("AbstractChunkerType", bound="AbstractChunker")

logger = logging.getLogger(__name__)


class ChunkerType(BaseEnum):
    """Types of chunkers supported by the system.

    JSON: For JSON files
    JSONL: For JSON Lines files
    TABLE: For CSV/TSV files
    TRIVIAL: For in-memory lists
    """

    JSON = "json"
    JSONL = "jsonl"
    TABLE = "table"
    TRIVIAL = "trivial"

class AbstractChunker(abc.ABC):
    """Abstract base class for chunking implementations.

    This class defines the interface for all chunkers, providing common
    functionality for batch processing and iteration.

    Args:
        batch_size: Number of items per batch (default: 10)
        limit: Maximum number of items to process (default: None)

    Attributes:
        units_processed: Number of items processed
        batch_size: Size of each batch
        limit: Maximum number of items to process
        cnt: Current count of processed items
        iteration_tried: Whether iteration has been attempted
    """

    def __init__(self, batch_size=10, limit=None):
        self.units_processed = 0
        self.batch_size = batch_size
        self.limit: int | None = limit
        self.cnt = 0
        self.iteration_tried = False

    def _limit_reached(self):
        """Check if the processing limit has been reached.

        Returns:
            bool: True if limit is reached, False otherwise
        """
        return self.limit is not None and self.cnt >= self.limit

    def __iter__(self):
        """Initialize iteration if not already done.

        Returns:
            self: Iterator instance
        """
        if not self.iteration_tried:
            self._prepare_iteration()
        return self

    def __next__(self):
        """Get the next batch of items.

        Returns:
            list: Next batch of items

        Raises:
            StopIteration: When no more items are available or limit is reached
        """
        batch = self._next_item()
        self.cnt += len(batch)
        if not batch or self._limit_reached():
            raise StopIteration
        return batch

    @abc.abstractmethod
    def _next_item(self):
        """Get the next item or batch of items.

        This method must be implemented by subclasses.

        Returns:
            Any: Next item or batch of items
        """
        pass

    def _prepare_iteration(self):
        """Prepare for iteration.

        This method is called before the first iteration attempt.
        """
        self.iteration_tried = True

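# Illustrative sketch (editor's example, not part of graflo 1.3.3): a minimal
# AbstractChunker subclass; _next_item() returns one batch per call, and an
# empty batch ends iteration. "CountingChunker" is a hypothetical name.
class CountingChunker(AbstractChunker):
    def __init__(self, n: int, **kwargs):
        super().__init__(**kwargs)
        self.n = n

    def _next_item(self):
        # Slice the virtual range [0, n) starting at the running count.
        return list(range(self.cnt, min(self.cnt + self.batch_size, self.n)))

# list(CountingChunker(5, batch_size=2))  # -> [[0, 1], [2, 3], [4]]
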
class FileChunker(AbstractChunker):
    """Base class for file-based chunking.

    This class provides functionality for reading and chunking files,
    with support for different encodings and compression.

    Args:
        filename: Path to the file to process
        encoding: File encoding (default: UTF_8)
        mode: File mode ('t' for text, 'b' for binary)
        **kwargs: Additional arguments for AbstractChunker

    Attributes:
        filename: Path to the file
        file_obj: File object for reading
        encoding: File encoding
        mode: File mode
    """

    def __init__(
        self,
        filename,
        encoding: EncodingType = EncodingType.UTF_8,
        mode="t",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.filename: Path = filename
        self.file_obj: TextIO | gzip.GzipFile | None = None
        self.encoding: EncodingType | None = encoding
        self.mode = mode
        if self.mode == "b":
            self.encoding = None

    def _next_item(self):
        """Get the next line from the file.

        Returns:
            str: Next line from the file

        Raises:
            StopIteration: When end of file is reached
            RuntimeError: If file is not opened (should not happen in normal flow)
        """
        # file_obj is guaranteed to be open after _prepare_iteration() is called
        if self.file_obj is None:
            raise RuntimeError("File should be opened before calling _next_item()")
        return next(self.file_obj)

    def _prepare_iteration(self):
        """Open the file for reading.

        Handles both regular and gzipped files.
        """
        super()._prepare_iteration()
        if ".gz" in self.filename.suffixes:
            self.file_obj = gzip.open(
                self.filename.absolute().as_posix(),
                f"r{self.mode}",
                encoding=self.encoding,
            )
        else:
            self.file_obj = open(
                self.filename.absolute().as_posix(),
                f"r{self.mode}",
                encoding=self.encoding,
            )

    def __next__(self):
        """Get the next batch of lines.

        Returns:
            list[str]: Next batch of lines

        Raises:
            StopIteration: When end of file is reached or limit is reached
            RuntimeError: If file is not opened (should not happen in normal flow)
        """
        batch = []

        if self._limit_reached():
            if self.file_obj is not None:
                self.file_obj.close()
            raise StopIteration
        while len(batch) < self.batch_size and not self._limit_reached():
            try:
                batch += [self._next_item()]
                self.cnt += 1
            except StopIteration:
                if batch:
                    return batch
                if self.file_obj is not None:
                    self.file_obj.close()
                raise StopIteration

        return batch


class TableChunker(FileChunker):
    """Chunker for CSV/TSV files.

    This class extends FileChunker to handle tabular data, converting
    each row into a dictionary with column headers as keys.

    Args:
        **kwargs: Arguments for FileChunker, including:
            sep: Field separator (default: ',')
    """

    def __init__(self, **kwargs):
        self.sep = kwargs.pop("sep", ",")
        super().__init__(**kwargs)
        self.header: list[str]

    def _prepare_iteration(self):
        """Read the header row and prepare for iteration."""
        super()._prepare_iteration()
        # After super()._prepare_iteration(), file_obj is guaranteed to be open
        if self.file_obj is None:
            raise RuntimeError("File should be opened by parent _prepare_iteration()")
        header = next(self.file_obj)
        self.header = header.rstrip("\n").split(self.sep)

    def __next__(self):
        """Get the next batch of rows as dictionaries.

        Returns:
            list[dict]: Next batch of rows as dictionaries
        """
        lines = super().__next__()
        lines2 = [
            next(csv.reader([line.rstrip()], skipinitialspace=True)) for line in lines
        ]
        dressed = [dict(zip(self.header, row)) for row in lines2]
        return dressed


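# Illustrative sketch (editor's example, not part of graflo 1.3.3): reading a
# tab-separated file in batches of dicts; "people.tsv" is a hypothetical path.
# tsv_chunker = TableChunker(filename=Path("people.tsv"), sep="\t", batch_size=500)
# for rows in tsv_chunker:
#     ...  # each element of `rows` is a dict keyed by the header columns
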
class JsonlChunker(FileChunker):
    """Chunker for JSON Lines files.

    This class extends FileChunker to handle JSON Lines format,
    parsing each line as a JSON object.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __next__(self):
        """Get the next batch of JSON objects.

        Returns:
            list[dict]: Next batch of parsed JSON objects
        """
        lines = super().__next__()
        lines2 = [json.loads(line) for line in lines]
        return lines2


class JsonChunker(FileChunker):
    """Chunker for JSON files.

    This class extends FileChunker to handle JSON files using
    streaming JSON parsing for memory efficiency.
    """

    def __init__(self, **kwargs):
        super().__init__(mode="b", **kwargs)
        self.parser: Any

    def _prepare_iteration(self):
        """Initialize the JSON parser for streaming."""
        super()._prepare_iteration()
        # After super()._prepare_iteration(), file_obj is guaranteed to be open
        if self.file_obj is None:
            raise RuntimeError("File should be opened by parent _prepare_iteration()")
        self.parser = ijson.items(self.file_obj, "item")

    def _next_item(self):
        """Get the next JSON object.

        Returns:
            dict: Next parsed JSON object

        Raises:
            StopIteration: When end of file is reached
        """
        return next(self.parser)


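# Illustrative sketch (editor's example, not part of graflo 1.3.3): JsonChunker
# opens the file in binary mode and streams top-level array items via
# ijson.items(fp, "item"), so a large JSON array never has to fit in memory.
# "records.json.gz" is a hypothetical path.
# for batch in JsonChunker(filename=Path("records.json.gz"), batch_size=1000):
#     ...  # `batch` is a list of up to 1000 parsed objects
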
class TrivialChunker(AbstractChunker):
    """Chunker for in-memory lists.

    This class provides chunking functionality for lists of dictionaries
    that are already in memory.

    Args:
        array: List of dictionaries to chunk
        **kwargs: Additional arguments for AbstractChunker
    """

    def __init__(self, array: list[dict], **kwargs):
        super().__init__(**kwargs)
        self.array = array

    def _next_item(self):
        """Get the next batch of items from the array.

        Returns:
            list[dict]: Next batch of items
        """
        return self.array[self.cnt : self.cnt + self.batch_size]

    def __next__(self):
        """Get the next batch of items.

        Returns:
            list[dict]: Next batch of items

        Raises:
            StopIteration: When no more items are available or limit is reached
        """
        batch = self._next_item()
        self.cnt += len(batch)
        if not batch or self._limit_reached():
            raise StopIteration
        return batch


class ChunkerDataFrame(AbstractChunker):
    """Chunker for Pandas DataFrames.

    This class provides chunking functionality for Pandas DataFrames,
    converting each chunk into a list of dictionaries.

    Args:
        df: DataFrame to chunk
        **kwargs: Additional arguments for AbstractChunker
    """

    def __init__(self, df: pd.DataFrame, **kwargs):
        super().__init__(**kwargs)
        self.df = df
        self.columns = df.columns

    def _next_item(self):
        """Get the next batch of rows as dictionaries.

        Returns:
            list[dict]: Next batch of rows as dictionaries
        """
        cid = self.cnt
        pre_batch = self.df.iloc[cid : cid + self.batch_size].values.tolist()
        batch = [{k: v for k, v in zip(self.columns, item)} for item in pre_batch]
        return batch


class ChunkerFactory:
    """Factory for creating appropriate chunkers.

    This class provides a factory method for creating chunkers based on
    the type of resource and configuration provided.

    Example:
        >>> chunker = ChunkerFactory.create_chunker(
        ...     resource=Path("data.json"),
        ...     type=ChunkerType.JSON,
        ...     batch_size=1000
        ... )
    """

    @classmethod
    def _guess_chunker_type(cls, filename: Path) -> ChunkerType:
        """Guess the appropriate chunker type based on file extension.

        This method examines the file extension to determine the most appropriate
        chunker type. It supports common file extensions for JSON, JSONL, and CSV/TSV
        files, including compressed versions (e.g., .json.gz, .csv.gz).

        Args:
            filename: Path to the file to analyze

        Returns:
            ChunkerType: Guessed chunker type based on file extension

        Raises:
            ValueError: If file extension is not recognized
        """
        # Get all suffixes and remove compression extensions
        suffixes = filename.suffixes
        base_suffix = [y for y in suffixes if y.lower() not in (".gz", ".zip")][
            -1
        ].lower()

        if base_suffix == ".json":
            return ChunkerType.JSON
        elif base_suffix == ".jsonl":
            return ChunkerType.JSONL
        elif base_suffix in (".csv", ".tsv", ".txt"):
            return ChunkerType.TABLE
        else:
            raise ValueError(
                f"Could not guess chunker type for file extension: {base_suffix}"
            )

    @classmethod
    def create_chunker(cls, **kwargs) -> AbstractChunker:
        """Create an appropriate chunker for the given resource.

        Args:
            **kwargs: Configuration for the chunker, including:
                resource: Path to file, list, or DataFrame
                type: Type of chunker to create (optional, will be guessed if None)
                batch_size: Size of each batch
                limit: Maximum number of items to process

        Returns:
            AbstractChunker: Appropriate chunker instance

        Raises:
            ValueError: If resource type is not supported or chunker type cannot be guessed
        """
        resource: Path | list[dict] | pd.DataFrame | None = kwargs.pop("resource", None)
        chunker_type = kwargs.pop("type", None)

        if isinstance(resource, list):
            return TrivialChunker(array=resource, **kwargs)
        elif isinstance(resource, pd.DataFrame):
            return ChunkerDataFrame(df=resource, **kwargs)
        elif isinstance(resource, Path):
            if chunker_type is None:
                chunker_type = cls._guess_chunker_type(resource)
            if chunker_type == ChunkerType.JSON:
                return JsonChunker(filename=resource, **kwargs)
            elif chunker_type == ChunkerType.JSONL:
                return JsonlChunker(filename=resource, **kwargs)
            elif chunker_type == ChunkerType.TABLE:
                return TableChunker(filename=resource, **kwargs)
            else:
                raise ValueError(f"Unknown chunker type: {chunker_type}")
        else:
            raise ValueError(f"Unsupported resource type: {type(resource)}")


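# Illustrative sketch (editor's example, not part of graflo 1.3.3): the factory
# dispatches on the resource's Python type, and for Path resources it guesses
# the chunker type from the file extension when `type` is omitted.
# in_memory = ChunkerFactory.create_chunker(resource=[{"a": 1}, {"a": 2}], batch_size=1)
# from_frame = ChunkerFactory.create_chunker(resource=pd.DataFrame({"a": [1, 2]}))
# from_file = ChunkerFactory.create_chunker(resource=Path("rows.csv.gz"))  # hypothetical path -> TableChunker
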
class ChunkFlusherMono:
    """Monolithic chunk flusher for writing data to files.

    This class provides functionality for writing chunks of data to files,
    with support for file naming and size limits.

    Args:
        target_prefix: Prefix for output files
        chunksize: Maximum number of items per file
        maxchunks: Maximum number of chunks to write
        suffix: Tag inserted into output file names (default: 'good')
    """

    def __init__(self, target_prefix, chunksize, maxchunks=None, suffix=None):
        self.target_prefix = target_prefix
        self.acc = []
        self.chunk_count = 0
        self.chunksize = chunksize
        self.maxchunks = maxchunks
        self.iprocessed = 0
        self.suffix = "good" if suffix is None else suffix
        logger.info(f" chunk flusher initialized with chunksize {self.chunksize}")

    def flush_chunk(self):
        """Write the current chunk to a file."""
        logger.info(
            f" in flush_chunk: {len(self.acc)}; chunk count: {self.chunk_count}"
        )
        if len(self.acc) > 0:
            filename = f"{self.target_prefix}#{self.suffix}#{self.chunk_count}.json.gz"
            with gzip.GzipFile(filename, "w") as fout:
                fout.write(json.dumps(self.acc, indent=4).encode("utf-8"))
            logger.info(f" flushed {filename}")
            self.chunk_count += 1
            self.iprocessed += len(self.acc)
            self.acc = []

    def push(self, item):
        """Add an item to the current chunk, flushing when the chunk is full.

        Args:
            item: Item to add to the chunk
        """
        self.acc.append(item)
        if len(self.acc) >= self.chunksize:
            self.flush_chunk()
            gc.collect()

    def stop(self):
        """Check whether the maximum number of chunks has been written.

        Returns:
            bool: True if maxchunks is set and has been reached, False otherwise
        """
        return self.maxchunks is not None and (self.chunk_count >= self.maxchunks)

    def items_processed(self):
        """Get the total number of items processed.

        Returns:
            int: Number of items processed
        """
        return self.iprocessed


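# Illustrative sketch (editor's example, not part of graflo 1.3.3): accumulating
# records and letting the flusher write gzipped JSON files named
# "<prefix>#good#<n>.json.gz" every `chunksize` items; "out/records" and
# `some_record_stream` are hypothetical.
# flusher = ChunkFlusherMono("out/records", chunksize=10000)
# for record in some_record_stream:
#     flusher.push(record)
# flusher.flush_chunk()  # write the final partial chunk
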
class FPSmart:
    """Smart file pointer for pattern-based file processing.

    This class provides a file-like interface with pattern-based
    transformation of the data being read.

    Args:
        fp: File pointer to wrap
        pattern: Regular expression pattern to match
        substitute: String to substitute for matches
        count: Maximum number of substitutions (0 for unlimited)
    """

    def __init__(self, fp, pattern, substitute="", count=0):
        self.fp = fp
        self.pattern = pattern
        self.p = re.compile(self.pattern)
        self.count = count
        self.sub = substitute

    def read(self, n):
        """Read and transform data from the underlying file.

        Args:
            n: Number of bytes to read

        Returns:
            bytes: Transformed data, re-encoded after substitution
        """
        s = self.fp.read(n).decode()
        return self.transform(s).encode()

    def transform(self, s):
        """Transform the data using the pattern.

        Args:
            s: Data to transform

        Returns:
            str: Transformed data
        """
        r = self.p.sub(self.sub, s, count=self.count)
        return r

    def close(self):
        """Close the underlying file pointer."""
        self.fp.close()


tag_wos = "REC"
pattern_wos = r"xmlns=\".*[^\"]\"(?=>)"
force_list_wos = (
    "abstract",
    "address_name",
    "book_note",
    "conf_date",
    "conf_info",
    "conf_location",
    "conf_title",
    "conference",
    "contributor",
    "doctype",
    "grant",
    "grant_id",
    "heading",
    "identifier",
    "keyword",
    "language",
    "name",
    "organization",
    "p",
    "publisher",
    "reference",
    "rw_author",
    "sponsor",
    "subheading",
    "subject",
    "suborganization",
    "title",
    "edition",
    "zip",
)


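# Illustrative sketch (editor's example, not part of graflo 1.3.3): the constants
# above appear to target Web of Science XML exports; pattern_wos strips the
# xmlns="..." declaration so element tags parse without a namespace prefix.
# "wos.xml.gz" is a hypothetical path.
# raw = gzip.open(Path("wos.xml.gz"), "rb")
# cleaned = FPSmart(raw, pattern_wos)  # read() decodes, substitutes, re-encodes
# data = cleaned.read(4096)            # bytes with xmlns declarations removed
# cleaned.close()
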
@contextmanager
def nullcontext(enter_result=None):
    """Context manager that does nothing.

    Args:
        enter_result: Value to return when entering the context

    Yields:
        The enter_result value
    """
    yield enter_result


def gunzip_file(fname_in, fname_out):
    """Decompress a gzipped file.

    Args:
        fname_in: Path to input gzipped file
        fname_out: Path to output decompressed file
    """
    with gzip.open(fname_in, "rb") as f_in:
        with open(fname_out, "wb") as f_out:
            copyfileobj(f_in, f_out)


def parse_simple(fp, good_cf, force_list=None, root_tag=None):
    """Parse an XML stream element by element and push records to a flusher.

    Args:
        fp: File pointer to parse
        good_cf: ChunkFlusherMono that accumulates parsed records
        force_list: List of tags that should always be parsed as lists
        root_tag: Tag of the elements to extract; if None, every closing tag
            is treated as a record
    """
    events = ("start", "end")
    tree = et.iterparse(fp, events)
    context = iter(tree)
    event, root = next(context)
    for event, pub in context:
        if event == "end" and (pub.tag == root_tag if root_tag is not None else True):
            item = et.tostring(pub, encoding="utf8", method="xml").decode("utf")
            obj = xmltodict.parse(
                item,
                force_cdata=True,
                force_list=force_list,
            )
            good_cf.push(obj)
            root.clear()
            if good_cf.stop():
                break


def convert(
    source: pathlib.Path,
    target_root: str,
    chunk_size: int = 10000,
    max_chunks=None,
    pattern: str | None = None,
    force_list=None,
    root_tag=None,
):
    """Convert an XML file to gzipped JSON chunks.

    This function processes an XML file and converts it to a series of JSON files,
    with support for pattern-based transformation and chunking.

    Args:
        source: Path to source XML file (.xml or .gz)
        target_root: Root path (prefix) for output files
        chunk_size: Number of items per output file (default: 10000)
        max_chunks: Maximum number of chunks to create (default: None)
        pattern: Regular expression pattern for transformation
        force_list: List of tags that should always be parsed as lists
        root_tag: Root tag to start parsing from

    Example:
        >>> convert(
        ...     source=pathlib.Path("data.xml"),
        ...     target_root="output",
        ...     chunk_size=1000,
        ...     pattern=r'xmlns="[^"]*"',
        ...     root_tag="PubmedArticle"
        ... )
    """
    logger.info(f" chunksize : {chunk_size} | maxchunks {max_chunks} ")

    good_cf = ChunkFlusherMono(target_root, chunk_size, max_chunks)
    bad_cf = ChunkFlusherMono(target_root, chunk_size, max_chunks, suffix="bad")

    if source.suffix == ".gz":
        open_foo: Callable = gzip.open
    elif source.suffix == ".xml":
        open_foo = open
    else:
        raise ValueError("Unknown file type")
    # pylint: disable-next=assignment
    fp: gzip.GzipFile | FPSmart | None

    with (
        open_foo(source, "rb")
        if isinstance(source, pathlib.Path)  # type: ignore
        else nullcontext()
    ) as fp:
        if pattern is not None:
            fp = FPSmart(fp, pattern)
        parse_simple(fp, good_cf, force_list, root_tag)

    good_cf.flush_chunk()

    logger.info(f" {good_cf.items_processed()} good records")
    bad_cf.flush_chunk()
    logger.info(f"{bad_cf.items_processed()} bad records")