graflo-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (45)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +39 -0
  3. graflo/architecture/__init__.py +37 -0
  4. graflo/architecture/actor.py +974 -0
  5. graflo/architecture/actor_util.py +425 -0
  6. graflo/architecture/edge.py +295 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +277 -0
  13. graflo/caster.py +409 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +144 -0
  16. graflo/cli/manage_dbs.py +193 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/db/__init__.py +32 -0
  20. graflo/db/arango/__init__.py +16 -0
  21. graflo/db/arango/conn.py +734 -0
  22. graflo/db/arango/query.py +180 -0
  23. graflo/db/arango/util.py +88 -0
  24. graflo/db/connection.py +304 -0
  25. graflo/db/manager.py +104 -0
  26. graflo/db/neo4j/__init__.py +16 -0
  27. graflo/db/neo4j/conn.py +432 -0
  28. graflo/db/util.py +49 -0
  29. graflo/filter/__init__.py +21 -0
  30. graflo/filter/onto.py +400 -0
  31. graflo/logging.conf +22 -0
  32. graflo/onto.py +186 -0
  33. graflo/plot/__init__.py +17 -0
  34. graflo/plot/plotter.py +556 -0
  35. graflo/util/__init__.py +23 -0
  36. graflo/util/chunker.py +739 -0
  37. graflo/util/merge.py +148 -0
  38. graflo/util/misc.py +37 -0
  39. graflo/util/onto.py +63 -0
  40. graflo/util/transform.py +406 -0
  41. graflo-1.1.0.dist-info/METADATA +157 -0
  42. graflo-1.1.0.dist-info/RECORD +45 -0
  43. graflo-1.1.0.dist-info/WHEEL +4 -0
  44. graflo-1.1.0.dist-info/entry_points.txt +5 -0
  45. graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/util/chunker.py ADDED
@@ -0,0 +1,739 @@
+ """Data chunking utilities for efficient file processing.
+
+ This module provides utilities for processing large files by breaking them into
+ manageable chunks. It supports various file formats (JSON, JSONL, CSV) and provides
+ both file-based and in-memory chunking capabilities.
+
+ Key Components:
+     - AbstractChunker: Base class for chunking implementations
+     - FileChunker: File-based chunking with encoding support
+     - TableChunker: CSV/TSV file chunking
+     - JsonlChunker: JSON Lines file chunking
+     - JsonChunker: JSON file chunking
+     - TrivialChunker: In-memory list chunking
+     - ChunkerDataFrame: Pandas DataFrame chunking
+     - ChunkerFactory: Factory for creating appropriate chunkers
+
+ Example:
+     >>> chunker = ChunkerFactory.create_chunker(
+     ...     resource=Path("data.json"),
+     ...     type=ChunkerType.JSON,
+     ...     batch_size=1000
+     ... )
+     >>> for batch in chunker:
+     ...     process_batch(batch)
+ """
+
+ import abc
+ import csv
+ import gc
+ import gzip
+ import json
+ import logging
+ import pathlib
+ import re
+ from contextlib import contextmanager
+ from pathlib import Path
+ from shutil import copyfileobj
+ from typing import Any, Callable, TextIO, TypeVar
+ from xml.etree import ElementTree as et
+
+ import ijson
+ import pandas as pd
+ import xmltodict
+
+ from graflo.architecture.onto import BaseEnum, EncodingType
+
+ AbstractChunkerType = TypeVar("AbstractChunkerType", bound="AbstractChunker")
+
+ logger = logging.getLogger(__name__)
+
+
+ class ChunkerType(BaseEnum):
+     """Types of chunkers supported by the system.
+
+     JSON: For JSON files
+     JSONL: For JSON Lines files
+     TABLE: For CSV/TSV files
+     TRIVIAL: For in-memory lists
+     """
+
+     JSON = "json"
+     JSONL = "jsonl"
+     TABLE = "table"
+     TRIVIAL = "trivial"
+
+
+ class AbstractChunker(abc.ABC):
+     """Abstract base class for chunking implementations.
+
+     This class defines the interface for all chunkers, providing common
+     functionality for batch processing and iteration.
+
+     Args:
+         batch_size: Number of items per batch (default: 10)
+         limit: Maximum number of items to process (default: None)
+
+     Attributes:
+         units_processed: Number of items processed
+         batch_size: Size of each batch
+         limit: Maximum number of items to process
+         cnt: Current count of processed items
+         iteration_tried: Whether iteration has been attempted
+     """
+
+     def __init__(self, batch_size=10, limit=None):
+         self.units_processed = 0
+         self.batch_size = batch_size
+         self.limit: int | None = limit
+         self.cnt = 0
+         self.iteration_tried = False
+
+     def _limit_reached(self):
+         """Check if the processing limit has been reached.
+
+         Returns:
+             bool: True if limit is reached, False otherwise
+         """
+         return self.limit is not None and self.cnt >= self.limit
+
+     def __iter__(self):
+         """Initialize iteration if not already done.
+
+         Returns:
+             self: Iterator instance
+         """
+         if not self.iteration_tried:
+             self._prepare_iteration()
+         return self
+
+     def __next__(self):
+         """Get the next batch of items.
+
+         Returns:
+             list: Next batch of items
+
+         Raises:
+             StopIteration: When no more items are available or limit is reached
+         """
+         batch = self._next_item()
+         self.cnt += len(batch)
+         if not batch or self._limit_reached():
+             raise StopIteration
+         return batch
+
+     @abc.abstractmethod
+     def _next_item(self):
+         """Get the next item or batch of items.
+
+         This method must be implemented by subclasses.
+
+         Returns:
+             Any: Next item or batch of items
+         """
+         pass
+
+     def _prepare_iteration(self):
+         """Prepare for iteration.
+
+         This method is called before the first iteration attempt.
+         """
+         self.iteration_tried = True
+
+
+ class FileChunker(AbstractChunker):
+     """Base class for file-based chunking.
+
+     This class provides functionality for reading and chunking files,
+     with support for different encodings and compression.
+
+     Args:
+         filename: Path to the file to process
+         encoding: File encoding (default: UTF_8)
+         mode: File mode ('t' for text, 'b' for binary)
+         **kwargs: Additional arguments for AbstractChunker
+
+     Attributes:
+         filename: Path to the file
+         file_obj: File object for reading
+         encoding: File encoding
+         mode: File mode
+     """
+
+     def __init__(
+         self,
+         filename,
+         encoding: EncodingType = EncodingType.UTF_8,
+         mode="t",
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.filename: Path = filename
+         self.file_obj: TextIO | gzip.GzipFile | None = None
+         self.encoding: EncodingType | None = encoding
+         self.mode = mode
+         if self.mode == "b":
+             self.encoding = None
+
+     def _next_item(self):
+         """Get the next line from the file.
+
+         Returns:
+             str: Next line from the file
+
+         Raises:
+             StopIteration: When end of file is reached
+         """
+         return next(self.file_obj)
+
+     def _prepare_iteration(self):
+         """Open the file for reading.
+
+         Handles both regular and gzipped files.
+         """
+         super()._prepare_iteration()
+         if ".gz" in self.filename.suffixes:
+             self.file_obj = gzip.open(
+                 self.filename.absolute().as_posix(),
+                 f"r{self.mode}",
+                 encoding=self.encoding,
+             )
+         else:
+             self.file_obj = open(
+                 self.filename.absolute().as_posix(),
+                 f"r{self.mode}",
+                 encoding=self.encoding,
+             )
+
+     def __next__(self):
+         """Get the next batch of lines.
+
+         Returns:
+             list[str]: Next batch of lines
+
+         Raises:
+             StopIteration: When end of file is reached or limit is reached
+         """
+         batch = []
+
+         if self._limit_reached():
+             self.file_obj.close()
+             raise StopIteration
+         while len(batch) < self.batch_size and not self._limit_reached():
+             try:
+                 batch += [self._next_item()]
+                 self.cnt += 1
+             except StopIteration:
+                 if batch:
+                     return batch
+                 self.file_obj.close()
+                 raise StopIteration
+
+         return batch
+
+
+ class TableChunker(FileChunker):
+     """Chunker for CSV/TSV files.
+
+     This class extends FileChunker to handle tabular data, converting
+     each row into a dictionary with column headers as keys.
+
+     Args:
+         **kwargs: Arguments for FileChunker, including:
+             sep: Field separator (default: ',')
+     """
+
+     def __init__(self, **kwargs):
+         self.sep = kwargs.pop("sep", ",")
+         super().__init__(**kwargs)
+         self.header: list[str]
+
+     def _prepare_iteration(self):
+         """Read the header row and prepare for iteration."""
+         super()._prepare_iteration()
+         header = next(self.file_obj)
+         self.header = header.rstrip("\n").split(self.sep)
+
+     def __next__(self):
+         """Get the next batch of rows as dictionaries.
+
+         Returns:
+             list[dict]: Next batch of rows as dictionaries
+         """
+         lines = super().__next__()
+         lines2 = [
+             next(csv.reader([line.rstrip()], skipinitialspace=True)) for line in lines
+         ]
+         dressed = [dict(zip(self.header, row)) for row in lines2]
+         return dressed
+
+
+ class JsonlChunker(FileChunker):
+     """Chunker for JSON Lines files.
+
+     This class extends FileChunker to handle JSON Lines format,
+     parsing each line as a JSON object.
+     """
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def __next__(self):
+         """Get the next batch of JSON objects.
+
+         Returns:
+             list[dict]: Next batch of parsed JSON objects
+         """
+         lines = super().__next__()
+         lines2 = [json.loads(line) for line in lines]
+         return lines2
+
+
+ class JsonChunker(FileChunker):
+     """Chunker for JSON files.
+
+     This class extends FileChunker to handle JSON files using
+     streaming JSON parsing for memory efficiency.
+     """
+
+     def __init__(self, **kwargs):
+         super().__init__(mode="b", **kwargs)
+         self.parser: Any
+
+     def _prepare_iteration(self):
+         """Initialize the JSON parser for streaming."""
+         super()._prepare_iteration()
+         self.parser = ijson.items(self.file_obj, "item")
+
+     def _next_item(self):
+         """Get the next JSON object.
+
+         Returns:
+             dict: Next parsed JSON object
+
+         Raises:
+             StopIteration: When end of file is reached
+         """
+         return next(self.parser)
+
+
+ class TrivialChunker(AbstractChunker):
+     """Chunker for in-memory lists.
+
+     This class provides chunking functionality for lists of dictionaries
+     that are already in memory.
+
+     Args:
+         array: List of dictionaries to chunk
+         **kwargs: Additional arguments for AbstractChunker
+     """
+
+     def __init__(self, array: list[dict], **kwargs):
+         super().__init__(**kwargs)
+         self.array = array
+
+     def _next_item(self):
+         """Get the next batch of items from the array.
+
+         Returns:
+             list[dict]: Next batch of items
+         """
+         return self.array[self.cnt : self.cnt + self.batch_size]
+
+     def __next__(self):
+         """Get the next batch of items.
+
+         Returns:
+             list[dict]: Next batch of items
+
+         Raises:
+             StopIteration: When no more items are available or limit is reached
+         """
+         batch = self._next_item()
+         self.cnt += len(batch)
+         if not batch or self._limit_reached():
+             raise StopIteration
+         return batch
+
+
+ class ChunkerDataFrame(AbstractChunker):
+     """Chunker for Pandas DataFrames.
+
+     This class provides chunking functionality for Pandas DataFrames,
+     converting each chunk into a list of dictionaries.
+
+     Args:
+         df: DataFrame to chunk
+         **kwargs: Additional arguments for AbstractChunker
+     """
+
+     def __init__(self, df: pd.DataFrame, **kwargs):
+         super().__init__(**kwargs)
+         self.df = df
+         self.columns = df.columns
+
+     def _next_item(self):
+         """Get the next batch of rows as dictionaries.
+
+         Returns:
+             list[dict]: Next batch of rows as dictionaries
+         """
+         cid = self.cnt
+         pre_batch = self.df.iloc[cid : cid + self.batch_size].values.tolist()
+         batch = [{k: v for k, v in zip(self.columns, item)} for item in pre_batch]
+         return batch
+
+
+ class ChunkerFactory:
+     """Factory for creating appropriate chunkers.
+
+     This class provides a factory method for creating chunkers based on
+     the type of resource and configuration provided.
+
+     Example:
+         >>> chunker = ChunkerFactory.create_chunker(
+         ...     resource=Path("data.json"),
+         ...     type=ChunkerType.JSON,
+         ...     batch_size=1000
+         ... )
+     """
+
+     @classmethod
+     def _guess_chunker_type(cls, filename: Path) -> ChunkerType:
+         """Guess the appropriate chunker type based on file extension.
+
+         This method examines the file extension to determine the most appropriate
+         chunker type. It supports common file extensions for JSON, JSONL, and CSV/TSV files,
+         including compressed versions (e.g., .json.gz, .csv.gz).
+
+         Args:
+             filename: Path to the file to analyze
+
+         Returns:
+             ChunkerType: Guessed chunker type based on file extension
+
+         Raises:
+             ValueError: If file extension is not recognized
+         """
+         # Get all suffixes and remove compression extensions
+         suffixes = filename.suffixes
+         base_suffix = [y for y in suffixes if y.lower() not in (".gz", ".zip")][
+             -1
+         ].lower()
+
+         if base_suffix == ".json":
+             return ChunkerType.JSON
+         elif base_suffix == ".jsonl":
+             return ChunkerType.JSONL
+         elif base_suffix in (".csv", ".tsv", ".txt"):
+             return ChunkerType.TABLE
+         else:
+             raise ValueError(
+                 f"Could not guess chunker type for file extension: {base_suffix}"
+             )
+
+     @classmethod
+     def create_chunker(cls, **kwargs) -> AbstractChunker:
+         """Create an appropriate chunker for the given resource.
+
+         Args:
+             **kwargs: Configuration for the chunker, including:
+                 resource: Path to file, list, or DataFrame
+                 type: Type of chunker to create (optional, will be guessed if None)
+                 batch_size: Size of each batch
+                 limit: Maximum number of items to process
+
+         Returns:
+             AbstractChunker: Appropriate chunker instance
+
+         Raises:
+             ValueError: If resource type is not supported or chunker type cannot be guessed
+         """
+         resource: Path | list[dict] | pd.DataFrame | None = kwargs.pop("resource", None)
+         chunker_type = kwargs.pop("type", None)
+
+         if isinstance(resource, list):
+             return TrivialChunker(array=resource, **kwargs)
+         elif isinstance(resource, pd.DataFrame):
+             return ChunkerDataFrame(df=resource, **kwargs)
+         elif isinstance(resource, Path):
+             if chunker_type is None:
+                 chunker_type = cls._guess_chunker_type(resource)
+
+             if chunker_type == ChunkerType.JSON:
+                 return JsonChunker(filename=resource, **kwargs)
+             elif chunker_type == ChunkerType.JSONL:
+                 return JsonlChunker(filename=resource, **kwargs)
+             elif chunker_type == ChunkerType.TABLE:
+                 return TableChunker(filename=resource, **kwargs)
+             else:
+                 raise ValueError(f"Unknown chunker type: {chunker_type}")
+         else:
+             raise ValueError(f"Unsupported resource type: {type(resource)}")
+
+
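# --- Editor's note: usage sketch, not part of the graflo source above ---
# A minimal illustration of the factory just shown, assuming a hypothetical
# JSON Lines file "events.jsonl". batch_size and limit are the AbstractChunker
# arguments documented earlier; note that create_chunker expects resource to be
# a pathlib.Path (a plain string falls through to the "Unsupported resource
# type" branch).

from pathlib import Path

from graflo.util.chunker import ChunkerFactory, ChunkerType

chunker = ChunkerFactory.create_chunker(
    resource=Path("events.jsonl"),
    type=ChunkerType.JSONL,  # optional; guessed from the file suffix if omitted
    batch_size=500,
    limit=10_000,
)
for batch in chunker:  # each batch is a list of parsed JSON objects (dicts)
    print(len(batch))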
+ class ChunkFlusherMono:
+     """Monolithic chunk flusher for writing data to files.
+
+     This class provides functionality for writing chunks of data to files,
+     with support for file naming and size limits.
+
+     Args:
+         target_prefix: Prefix for output files
+         chunksize: Maximum number of items per file
+         maxchunks: Maximum number of chunks to write
+         suffix: Label inserted into output file names (default: 'good')
+     """
+
+     def __init__(self, target_prefix, chunksize, maxchunks=None, suffix=None):
+         self.target_prefix = target_prefix
+         self.acc = []
+         self.chunk_count = 0
+         self.chunksize = chunksize
+         self.maxchunks = maxchunks
+         self.iprocessed = 0
+         self.suffix = "good" if suffix is None else suffix
+         logger.info(f" in flush_chunk {self.chunksize}")
+
+     def flush_chunk(self):
+         """Write the current chunk to a file."""
+         logger.info(
+             f" in flush_chunk: : {len(self.acc)}; chunk count : {self.chunk_count}"
+         )
+         if len(self.acc) > 0:
+             filename = f"{self.target_prefix}#{self.suffix}#{self.chunk_count}.json.gz"
+             with gzip.GzipFile(filename, "w") as fout:
+                 fout.write(json.dumps(self.acc, indent=4).encode("utf-8"))
+             logger.info(f" flushed {filename}")
+             self.chunk_count += 1
+             self.iprocessed += len(self.acc)
+             self.acc = []
+
+     def push(self, item):
+         """Add an item to the current chunk.
+
+         Args:
+             item: Item to add to the chunk
+         """
+         self.acc.append(item)
+         if len(self.acc) >= self.chunksize:
+             self.flush_chunk()
+             gc.collect()
+
+     def stop(self):
+         """Return True if the maximum number of chunks has been written."""
+         return self.maxchunks is not None and (self.chunk_count >= self.maxchunks)
+
+     def items_processed(self):
+         """Get the total number of items processed.
+
+         Returns:
+             int: Number of items processed
+         """
+         return self.iprocessed
+
+
+ class FPSmart:
+     """Smart file pointer for pattern-based file processing.
+
+     This class provides a file-like interface with pattern-based
+     transformation of the data being read.
+
+     Args:
+         fp: File pointer to wrap
+         pattern: Regular expression pattern to match
+         substitute: String to substitute for matches
+         count: Maximum number of substitutions (0 for unlimited)
+     """
+
+     def __init__(self, fp, pattern, substitute="", count=0):
+         self.fp = fp
+         self.pattern = pattern
+         self.p = re.compile(self.pattern)
+         self.count = count
+         self.sub = substitute
+
+     def read(self, n):
+         """Read and transform data from the file.
+
+         Args:
+             n: Number of bytes to read
+
+         Returns:
+             bytes: Transformed data
+         """
+         s = self.fp.read(n).decode()
+         return self.transform(s).encode()
+
+     def transform(self, s):
+         """Transform the data using the pattern.
+
+         Args:
+             s: Data to transform
+
+         Returns:
+             str: Transformed data
+         """
+         self.p.search(s)
+         r = self.p.sub(self.sub, s, count=self.count)
+         return r
+
+     def close(self):
+         """Close the underlying file pointer."""
+         self.fp.close()
+
+
+ tag_wos = "REC"
+ pattern_wos = r"xmlns=\".*[^\"]\"(?=>)"
+ force_list_wos = (
+     "abstract",
+     "address_name",
+     "book_note",
+     "conf_date",
+     "conf_info",
+     "conf_location",
+     "conf_title",
+     "conference",
+     "contributor",
+     "doctype",
+     "grant",
+     "grant_id",
+     "heading",
+     "identifier",
+     "keyword",
+     "language",
+     "name",
+     "organization",
+     "p",
+     "publisher",
+     "reference",
+     "rw_author",
+     "sponsor",
+     "subheading",
+     "subject",
+     "suborganization",
+     "title",
+     "edition",
+     "zip",
+ )
+
+
+ @contextmanager
+ def nullcontext(enter_result=None):
+     """Context manager that does nothing.
+
+     Args:
+         enter_result: Value to return when entering the context
+
+     Yields:
+         The enter_result value
+     """
+     yield enter_result
+
+
+ def gunzip_file(fname_in, fname_out):
+     """Decompress a gzipped file.
+
+     Args:
+         fname_in: Path to input gzipped file
+         fname_out: Path to output decompressed file
+     """
+     with gzip.open(fname_in, "rb") as f_in:
+         with open(fname_out, "wb") as f_out:
+             copyfileobj(f_in, f_out)
+
+
+ def parse_simple(fp, good_cf, force_list=None, root_tag=None):
+     """Parse XML file with simple structure.
+
+     Args:
+         fp: File pointer to parse
+         good_cf: ChunkFlusherMono that collects parsed records
+         force_list: List of tags that should always be lists
+         root_tag: Root tag to start parsing from
+
+     Returns:
+         None: Parsed records are pushed to good_cf
+     """
+     events = ("start", "end")
+     tree = et.iterparse(fp, events)
+     context = iter(tree)
+     event, root = next(context)
+     for event, pub in context:
+         if event == "end" and (pub.tag == root_tag if root_tag is not None else True):
+             item = et.tostring(pub, encoding="utf8", method="xml").decode("utf")
+             obj = xmltodict.parse(
+                 item,
+                 force_cdata=True,
+                 force_list=force_list,
+             )
+             good_cf.push(obj)
+             root.clear()
+             if good_cf.stop():
+                 break
+
+
+ def convert(
+     source: pathlib.Path,
+     target_root: str,
+     chunk_size: int = 10000,
+     max_chunks=None,
+     pattern: str | None = None,
+     force_list=None,
+     root_tag=None,
+ ):
+     """Convert XML file to JSON chunks.
+
+     This function processes an XML file and converts it to a series of JSON files,
+     with support for pattern-based transformation and chunking.
+
+     Args:
+         source: Path to source XML file
+         target_root: Root path for output files
+         chunk_size: Number of items per output file (default: 10000)
+         max_chunks: Maximum number of chunks to create (default: None)
+         pattern: Regular expression pattern for transformation
+         force_list: List of tags that should always be lists
+         root_tag: Root tag to start parsing from
+
+     Example:
+         >>> convert(
+         ...     source=Path("data.xml"),
+         ...     target_root="output",
+         ...     chunk_size=1000,
+         ...     pattern=r'xmlns="[^"]*"',
+         ...     root_tag="PubmedArticle"
+         ... )
+     """
+     logger.info(f" chunksize : {chunk_size} | maxchunks {max_chunks} ")
+
+     good_cf = ChunkFlusherMono(target_root, chunk_size, max_chunks)
+     bad_cf = ChunkFlusherMono(target_root, chunk_size, max_chunks, suffix="bad")
+
+     if source.suffix == ".gz":
+         open_foo: Callable = gzip.open
+     elif source.suffix == ".xml":
+         open_foo = open
+     else:
+         raise ValueError("Unknown file type")
+     # pylint: disable-next=assignment
+     fp: gzip.GzipFile | FPSmart | None
+
+     with (
+         open_foo(source, "rb")
+         if isinstance(  # type: ignore
+             source, pathlib.Path
+         )
+         else nullcontext() as fp
+     ):
+         if pattern is not None:
+             fp = FPSmart(fp, pattern)
+         else:
+             fp = fp
+         parse_simple(fp, good_cf, force_list, root_tag)
+
+     good_cf.flush_chunk()
+
+     logger.info(f" {good_cf.items_processed()} good records")
+     bad_cf.flush_chunk()
+     logger.info(f"{bad_cf.items_processed()} bad records")