endnote-utils 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
endnote_utils/core.py ADDED
@@ -0,0 +1,673 @@
+ # src/endnote_utils/core.py
+ from __future__ import annotations
+
+ import csv
+ import json
+ import logging
+ import time
+ import xml.etree.ElementTree as ET
+ from collections import Counter
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, Iterable, List, Optional, Tuple
+
+ # ----------------------------
+ # Public constants
+ # ----------------------------
+
+ DEFAULT_FIELDNAMES: List[str] = [
+     "database",
+     "ref_type",
+     "title",
+     "journal",
+     "authors",
+     "year",
+     "volume",
+     "number",
+     "abstract",
+     "doi",
+     "urls",
+     "keywords",
+     "publisher",
+     "isbn",
+     "language",
+     "extracted_date",
+ ]
+
+ CSV_QUOTING_MAP = {
+     "minimal": csv.QUOTE_MINIMAL,
+     "all": csv.QUOTE_ALL,
+     "nonnumeric": csv.QUOTE_NONNUMERIC,
+     "none": csv.QUOTE_NONE,
+ }
+
+ # Report layout
+ DUPES_DETAILS_LIMIT = 50
+ STATS_LIST_LIMIT = 20
+
+
+ # ----------------------------
+ # FS helpers
+ # ----------------------------
+
+ def ensure_parent_dir(p: Path) -> None:
+     """Create parent directory if it doesn't exist."""
+     p.parent.mkdir(parents=True, exist_ok=True)
+
+
+ # ----------------------------
+ # Text helpers
+ # ----------------------------
+
+ def clean_text(text: Optional[str]) -> str:
+     """
+     Trim, collapse internal whitespace, remove stray CRs, keep punctuation intact.
+     """
+     if not text:
+         return ""
+     text = text.replace("\r", " ")
+     return " ".join(text.split()).strip()
+
+
+ def safe_find_text(node: ET.Element, path: str) -> str:
+     """Find text with XPath and return cleaned string."""
+     elem = node.find(path)
+     return clean_text(elem.text) if elem is not None and elem.text is not None else ""
+
+
+ def join_nonempty(items: Iterable[str], sep: str) -> str:
+     return sep.join(x for x in (i.strip() for i in items) if x)
+
+
+ def normalize_text_for_key(s: str) -> str:
+     """Lowercase + strip non-alnum + single-space. Good for stable keys."""
+     if not s:
+         return ""
+     s = s.lower()
+     s = "".join(ch for ch in s if ch.isalnum() or ch.isspace())
+     return " ".join(s.split())
+
+
+ # ----------------------------
+ # Record extraction
+ # ----------------------------
+
+ def process_doi(record: ET.Element) -> str:
+     """Extract and format DOI information to a canonical URL if possible."""
+     doi_raw = safe_find_text(record, ".//electronic-resource-num/style")
+     if not doi_raw:
+         return ""
+     if doi_raw.startswith("10."):
+         return f"https://doi.org/{doi_raw}"
+     if doi_raw.startswith(("http://", "https://")):
+         return doi_raw
+     return ""
+
+
+ def extract_authors(record: ET.Element) -> str:
+     """Collect authors from //author/style, joined by '; '."""
+     authors: List[str] = []
+     for author in record.findall(".//author"):
+         style = author.find("style")
+         if style is not None and style.text:
+             authors.append(clean_text(style.text))
+     return join_nonempty(authors, "; ")
+
+
+ def extract_urls(record: ET.Element) -> str:
+     """Collect related URLs from //urls/related-urls/url/style, joined by ' | '."""
+     urls: List[str] = []
+     for url in record.findall(".//urls/related-urls/url"):
+         style = url.find("style")
+         if style is not None and style.text:
+             urls.append(clean_text(style.text))
+     # Deduplicate while preserving order
+     seen = set()
+     deduped = []
+     for u in urls:
+         if u not in seen:
+             seen.add(u)
+             deduped.append(u)
+     return join_nonempty(deduped, " | ")
+
+
+ def extract_keywords(record: ET.Element) -> str:
+     """Collect keywords from //keywords/keyword/style, joined by '; ' (deduped)."""
+     items: List[str] = []
+     for kw in record.findall(".//keywords/keyword"):
+         style = kw.find("style")
+         if style is not None and style.text:
+             items.append(clean_text(style.text))
+     seen = set()
+     out: List[str] = []
+     for x in items:
+         if x not in seen:
+             seen.add(x)
+             out.append(x)
+     return join_nonempty(out, "; ")
+
+
+ def process_record(record: ET.Element, database: str) -> Dict[str, str]:
+     """Transform a <record> element into a flat dictionary."""
+     ref_type_name = ""
+     ref_type = record.find("ref-type")
+     if ref_type is not None:
+         ref_type_name = ref_type.get("name") or ""
+
+     return {
+         "database": database,
+         "ref_type": clean_text(ref_type_name),
+         "title": safe_find_text(record, ".//title/style"),
+         "journal": safe_find_text(record, ".//secondary-title/style"),
+         "authors": extract_authors(record),
+         "year": safe_find_text(record, ".//year/style"),
+         "volume": safe_find_text(record, ".//volume/style"),
+         "number": safe_find_text(record, ".//number/style"),
+         "abstract": safe_find_text(record, ".//abstract/style"),
+         "doi": process_doi(record),
+         "urls": extract_urls(record),
+         "keywords": extract_keywords(record),
+         "publisher": safe_find_text(record, ".//publisher/style"),
+         "isbn": safe_find_text(record, ".//isbn/style"),
+         "language": safe_find_text(record, ".//language/style"),
+         "extracted_date": datetime.now().strftime("%Y-%m-%d"),
+     }
+
+
+ # ----------------------------
+ # XML streaming + filters
+ # ----------------------------
+
+ def iter_records(xml_path: Path) -> Iterable[ET.Element]:
+     """Stream <record> elements with low memory footprint."""
+     context = ET.iterparse(str(xml_path), events=("start", "end"))
+     _, root = next(context)
+     for event, elem in context:
+         if event == "end" and elem.tag == "record":
+             yield elem
+             elem.clear()
+             root.clear()
+
+
+ def record_matches_filters(row: Dict[str, str], ref_type: Optional[str], year: Optional[str]) -> bool:
+     if ref_type and row.get("ref_type") != ref_type:
+         return False
+     if year and row.get("year") != str(year):
+         return False
+     return True
+
+
+ # ----------------------------
+ # Deduplication helpers
+ # ----------------------------
+
+ def dedupe_key(row: Dict[str, str], mode: str) -> Optional[str]:
+     """
+     mode: 'none' | 'doi' | 'title-year'
+     Returns None when no applicable key can be formed (row passes through).
+     """
+     if mode == "doi":
+         k = (row.get("doi") or "").strip()
+         return k or None
+     if mode == "title-year":
+         title = normalize_text_for_key(row.get("title", ""))
+         year = (row.get("year") or "").strip()
+         if title and year:
+             return f"{title}::{year}"
+         return None
+     return None
+
+
+ # ----------------------------
+ # Retraction detection (basic heuristic)
+ # ----------------------------
+
+ def is_retraction(record: ET.Element) -> bool:
+     """
+     Heuristic: consider a record a retraction if its notes or title contain substrings like
+     'retraction', 'retracted', 'withdrawn', 'erratum'.
+     """
+     text_blob = " ".join(
+         [
+             safe_find_text(record, ".//notes/style"),
+             safe_find_text(record, ".//title/style"),
+         ]
+     ).lower()
+     indicators = ("retraction", "retracted", "withdrawn", "erratum")
+     return any(tok in text_blob for tok in indicators)
+
+
+ # ----------------------------
+ # Writers (CSV / JSON / XLSX)
+ # ----------------------------
+
+ def _write_rows_csv(
+     rows_iter: Iterable[Dict[str, str]],
+     out_path: Path,
+     fieldnames: List[str],
+     delimiter: str,
+     quoting: str,
+     include_header: bool,
+     encoding: str,
+ ) -> int:
+     qmode = CSV_QUOTING_MAP[quoting.lower()]
+     ensure_parent_dir(out_path)
+     count = 0
+     with open(out_path, "w", newline="", encoding=encoding) as f:
+         writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
+         if include_header:
+             writer.writeheader()
+         for row in rows_iter:
+             writer.writerow({k: row.get(k, "") for k in fieldnames})
+             count += 1
+     return count
+
+
+ def _write_rows_json(
+     rows_iter: Iterable[Dict[str, str]],
+     out_path: Path,
+     fieldnames: List[str],
+     encoding: str,
+ ) -> int:
+     """Write a JSON array as a stream, without holding all rows in memory."""
+     ensure_parent_dir(out_path)
+     count = 0
+     with open(out_path, "w", encoding=encoding) as f:
+         f.write("[")
+         first = True
+         for row in rows_iter:
+             obj = {k: row.get(k, "") for k in fieldnames}
+             if first:
+                 first = False
+             else:
+                 f.write(",")
+             f.write(json.dumps(obj, ensure_ascii=False))
+             count += 1
+         f.write("]")
+     return count
+
+
+ def _write_rows_xlsx(
+     rows_iter: Iterable[Dict[str, str]],
+     out_path: Path,
+     fieldnames: List[str],
+ ) -> int:
+     """Write an Excel file using openpyxl (installed via project dependencies)."""
+     try:
+         from openpyxl import Workbook
+     except ImportError as e:
+         raise RuntimeError(
+             "Excel output requires 'openpyxl'. Ensure it is installed."
+         ) from e
+
+     ensure_parent_dir(out_path)
+     wb = Workbook()
+     ws = wb.active
+     ws.title = "records"
+     ws.append(fieldnames)  # header
+
+     count = 0
+     for row in rows_iter:
+         ws.append([row.get(k, "") for k in fieldnames])
+         count += 1
+
+     wb.save(out_path)
+     return count
+
+
+ # ----------------------------
+ # Generic export + report (+ dedupe + stats, pretty report + duplicates table)
+ # ----------------------------
+
+ def export_files_with_report(
+     inputs: List[Path],
+     out_path: Path,
+     out_format: str,  # "csv" | "json" | "xlsx"
+     *,
+     fieldnames: Optional[List[str]] = None,
+     delimiter: str = ",",
+     quoting: str = "minimal",
+     include_header: bool = True,
+     encoding: str = "utf-8",
+     ref_type: Optional[str] = None,
+     year: Optional[str] = None,
+     max_records_per_file: Optional[int] = None,
+     report_path: Optional[Path] = None,
+     # Dedupe + stats
+     dedupe: str = "none",
+     dedupe_keep: str = "first",
+     stats: bool = False,
+     stats_json: Optional[Path] = None,
+     top_authors: int = 10,
+ ) -> Tuple[int, Path, Optional[Path]]:
+     """
+     Stream records from one or many EndNote XML files and write to CSV/JSON/XLSX.
+     Writes a pretty TXT report unless report_path is None.
+
+     Deduplication:
+       - dedupe='doi' → unique by DOI
+       - dedupe='title-year' → unique by normalized (title, year)
+       - dedupe_keep='first' or 'last' (applies within each input file)
+
+     Stats (when stats=True) add counts by year/ref_type/journal and top authors.
+     stats_json (if provided) writes a JSON snapshot of these stats + duplicates.
+
+     The report now includes a per-database table: Origin / Retractions / Duplicates / Remaining.
+
+     Returns (total_rows_written, out_path, report_path or None if disabled).
+     """
+     fieldnames = fieldnames or DEFAULT_FIELDNAMES
+     out_format = out_format.lower()
+     if out_format not in {"csv", "json", "xlsx"}:
+         raise ValueError(f"Unknown out_format: {out_format}")
+
+     # Per-run accumulators
+     per_file_lines: List[str] = []
+
+     year_counter = Counter()
+     type_counter = Counter()
+     journal_counter = Counter()
+     author_counter = Counter()
+
+     # Dedupe state
+     seen_keys: set[str] = set()
+     duplicates_counter = Counter()  # global key -> duplicate count
+     dupes_removed_per_db: Dict[str, int] = {}
+
+     # Per-database accounting for the table
+     # Origin: records after filters, before dedupe (and before max_records per file cap)
+     # Retractions: simple heuristic via is_retraction()
+     # Duplicates: removed due to dedupe
+     # Remaining: exported (post-dedupe)
+     per_db: Dict[str, Dict[str, int]] = {}
+
+     def rows() -> Iterable[Dict[str, str]]:
+         nonlocal per_file_lines, seen_keys, duplicates_counter, per_db
+         nonlocal year_counter, type_counter, journal_counter, author_counter, dupes_removed_per_db
+
+         for xml_path in inputs:
+             database = xml_path.stem
+             per_db.setdefault(database, {"origin": 0, "retractions": 0, "duplicates": 0, "remaining": 0})
+             dupes_removed_per_db.setdefault(database, 0)
+
+             logging.info("Processing %s (database=%s)", xml_path.name, database)
+
+             produced = 0
+             skipped = 0
+
+             buffered: List[Dict[str, str]] = []
+             buffered_keys_index: Dict[str, int] = {}
+
+             for rec in iter_records(xml_path):
+                 try:
+                     # Build row (for filters & output)
+                     row = process_record(rec, database=database)
+
+                     # Filter
+                     if not record_matches_filters(row, ref_type, year):
+                         continue
+
+                     # Origin++ (count any passing-filter record before dedupe)
+                     per_db[database]["origin"] += 1
+
+                     # Retraction heuristic
+                     if is_retraction(rec):
+                         per_db[database]["retractions"] += 1
+
+                     # Dedupe
+                     k = dedupe_key(row, dedupe)
+                     if k and dedupe != "none":
+                         if dedupe_keep == "first":
+                             if k in seen_keys:
+                                 duplicates_counter[k] += 1
+                                 per_db[database]["duplicates"] += 1
+                                 dupes_removed_per_db[database] += 1
+                                 continue
+                             seen_keys.add(k)
+                             buffered.append(row)
+                             produced += 1
+                         else:  # keep last within this file
+                             if k in buffered_keys_index:
+                                 # replace old occurrence in this file buffer
+                                 prev_idx = buffered_keys_index[k]
+                                 buffered[prev_idx] = row
+                                 duplicates_counter[k] += 1
+                                 per_db[database]["duplicates"] += 1
+                                 dupes_removed_per_db[database] += 1
+                             else:
+                                 buffered_keys_index[k] = len(buffered)
+                                 buffered.append(row)
+                                 produced += 1
+                                 seen_keys.add(k)
+                     else:
+                         buffered.append(row)
+                         produced += 1
+
+                     if max_records_per_file and produced >= max_records_per_file:
+                         break
+
+                 except Exception:
+                     skipped += 1
+                     logging.debug("Record error in %s", xml_path, exc_info=True)
+
+             # Remaining = exported from this file
+             per_db[database]["remaining"] += len(buffered)
+
+             per_file_lines.append(f"{xml_path.name:<15} : {len(buffered)} exported, {skipped} skipped")
+
+             # Stats
+             if stats:
+                 for r in buffered:
+                     y = (r.get("year") or "").strip()
+                     t = (r.get("ref_type") or "").strip()
+                     j = (r.get("journal") or "").strip()
+                     if y:
+                         year_counter[y] += 1
+                     if t:
+                         type_counter[t] += 1
+                     if j:
+                         journal_counter[j] += 1
+                     if r.get("authors"):
+                         for a in (x.strip() for x in r["authors"].split(";")):
+                             if a:
+                                 author_counter[a] += 1
+
+             # Yield to writer
+             for r in buffered:
+                 yield r
+
+     # Select writer
+     start_ts = time.time()
+     run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+     if out_format == "csv":
+         total = _write_rows_csv(rows(), out_path, fieldnames, delimiter, quoting, include_header, encoding)
+     elif out_format == "json":
+         total = _write_rows_json(rows(), out_path, fieldnames, encoding)
+     else:  # xlsx
+         total = _write_rows_xlsx(rows(), out_path, fieldnames)
+
+     duration = time.time() - start_ts
+
+     # ---------- Pretty report builder ----------
+     def _header_line(title: str) -> List[str]:
+         bar = "=" * 40
+         return [bar, title, bar]
+
+     def _section_line(title: str) -> List[str]:
+         return ["", title, "-" * 40]
+
+     report_lines: List[str] = []
+     report_lines += _header_line("EndNote Export Report")
+     report_lines += [
+         f"Run started : {run_start}",
+         f"Files : {len(inputs)}",
+         f"Duration : {duration:.2f} seconds",
+     ]
+
+     # Per-file section
+     report_lines += _section_line("Per-file results")
+     report_lines += per_file_lines
+     report_lines.append(f"TOTAL exported: {total}")
+
+     # Per-database duplicates table
+     # Build totals row
+     if per_db:
+         report_lines += _section_line("Duplicates table (by database)")
+         # compute column widths
+         db_names = list(per_db.keys())
+         db_col_w = max([len("Database")] + [len(db) for db in db_names])
+
+         # totals
+         tot_origin = sum(d["origin"] for d in per_db.values())
+         tot_retract = sum(d["retractions"] for d in per_db.values())
+         tot_dupes = sum(d["duplicates"] for d in per_db.values())
+         tot_remain = sum(d["remaining"] for d in per_db.values())
+
+         header = f"{'Database':<{db_col_w}} {'Origin':>8} {'Retractions':>12} {'Duplicates':>10} {'Remaining':>10}"
+         report_lines.append(header)
+         report_lines.append("-" * len(header))
+
+         for db in sorted(per_db.keys()):
+             d = per_db[db]
+             line = (
+                 f"{db:<{db_col_w}} "
+                 f"{d['origin']:>8} "
+                 f"{d['retractions']:>12} "
+                 f"{d['duplicates']:>10} "
+                 f"{d['remaining']:>10}"
+             )
+             report_lines.append(line)
+
+         total_line = (
+             f"{'TOTAL':<{db_col_w}} "
+             f"{tot_origin:>8} "
+             f"{tot_retract:>12} "
+             f"{tot_dupes:>10} "
+             f"{tot_remain:>10}"
+         )
+         report_lines.append(total_line)
+
+     # Duplicates key summary (top)
+     if dedupe != "none":
+         report_lines += _section_line("Duplicate keys (top)")
+         total_dupes_global = sum(duplicates_counter.values())
+         report_lines.append(f"Mode : {dedupe}")
+         report_lines.append(f"Keep : {dedupe_keep}")
+         report_lines.append(f"Removed: {total_dupes_global}")
+         if total_dupes_global > 0:
+             report_lines.append("Details (top):")
+             for k, c in duplicates_counter.most_common(DUPES_DETAILS_LIMIT):
+                 report_lines.append(f" {k} : {c} duplicate(s)")
+
+     # Summary stats
+     if stats:
+         def head(counter: Counter, n: int = 10):
+             return [(k, c) for k, c in counter.most_common(n) if k]
+
+         report_lines += _section_line("Summary stats")
+         # Year
+         report_lines.append("By year:")
+         for y in sorted(year_counter.keys()):
+             report_lines.append(f" {y:>6} : {year_counter[y]}")
+         # Ref type
+         report_lines.append("")
+         report_lines.append("By ref_type (top):")
+         for k, c in head(type_counter, STATS_LIST_LIMIT):
+             report_lines.append(f" {k}: {c}")
+         # Journal
+         report_lines.append("")
+         report_lines.append(f"By journal (top {STATS_LIST_LIMIT}):")
+         for k, c in head(journal_counter, STATS_LIST_LIMIT):
+             report_lines.append(f" {k}: {c}")
+         # Authors
+         report_lines.append("")
+         report_lines.append(f"Top authors (top {top_authors}):")
+         for k, c in head(author_counter, top_authors):
+             report_lines.append(f" {k}: {c}")
+
+     # Optional JSON dump
+     if stats_json:
+         ensure_parent_dir(stats_json)
+         with open(stats_json, "w", encoding="utf-8") as jf:
+             json.dump(
+                 {
+                     "totals": {
+                         "exported": total,
+                         "files_processed": len(inputs),
+                         "duration_seconds": duration,
+                     },
+                     "by_year": dict(year_counter),
+                     "by_ref_type": dict(type_counter),
+                     "by_journal": dict(journal_counter),
+                     "top_authors": author_counter.most_common(top_authors),
+                     "duplicates": {
+                         "mode": dedupe,
+                         "keep": dedupe_keep,
+                         "removed": sum(duplicates_counter.values()) if dedupe != "none" else 0,
+                         "top": duplicates_counter.most_common(DUPES_DETAILS_LIMIT) if dedupe != "none" else [],
+                         "by_database": per_db,
+                     },
+                 },
+                 jf,
+                 ensure_ascii=False,
+                 indent=2,
+             )
+
+     # Write report unless disabled
+     final_report_path: Optional[Path] = report_path
+     if final_report_path is not None:
+         final_report_path = final_report_path or out_path.with_name(out_path.stem + "_report.txt")
+         ensure_parent_dir(final_report_path)
+         with open(final_report_path, "w", encoding="utf-8") as rf:
+             rf.write("\n".join(report_lines))
+
+     return total, out_path, final_report_path
+
+
+ # ----------------------------
+ # Back-compat convenience wrappers (CSV only)
+ # ----------------------------
+
+ def export_files_to_csv_with_report(
+     inputs: List[Path],
+     csv_path: Path,
+     report_path: Optional[Path] = None,
+     *,
+     fieldnames: Optional[List[str]] = None,
+     delimiter: str = ",",
+     quoting: str = "minimal",
+     include_header: bool = True,
+     encoding: str = "utf-8",
+     ref_type: Optional[str] = None,
+     year: Optional[str] = None,
+     max_records_per_file: Optional[int] = None,
+ ) -> Tuple[int, Path, Optional[Path]]:
+     """Legacy API: export to CSV + TXT report (or no report if report_path=None)."""
+     return export_files_with_report(
+         inputs=inputs,
+         out_path=csv_path,
+         out_format="csv",
+         fieldnames=fieldnames,
+         delimiter=delimiter,
+         quoting=quoting,
+         include_header=include_header,
+         encoding=encoding,
+         ref_type=ref_type,
+         year=year,
+         max_records_per_file=max_records_per_file,
+         report_path=report_path,
+     )
+
+
+ def export(xml_file: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
+     """Convenience: single XML file to CSV (+report unless disabled)."""
+     return export_files_to_csv_with_report([xml_file], csv_path, **kwargs)
+
+
+ def export_folder(folder: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
+     """Convenience: all *.xml in folder to CSV (+report unless disabled)."""
+     inputs = sorted(p for p in Path(folder).glob("*.xml") if p.is_file())
+     if not inputs:
+         raise FileNotFoundError(f"No *.xml found in {folder}")
+     return export_files_to_csv_with_report(inputs, csv_path, **kwargs)
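
Below is a minimal usage sketch of the new generic export API added in 0.2.0. It is illustrative only and not part of the package diff: the folder and output paths are hypothetical, the import path follows the added file's location (endnote_utils/core.py), and the XLSX branch assumes openpyxl is installed.

    from pathlib import Path
    from endnote_utils.core import export_files_with_report, export_folder

    # All *.xml files in ./exports (hypothetical folder) -> one CSV plus a TXT report
    total, csv_out, report = export_folder(
        Path("exports"),
        Path("out/references.csv"),
        report_path=Path("out/references_report.txt"),
    )
    print(f"exported {total} rows to {csv_out}, report at {report}")

    # Explicit file list -> XLSX, deduplicated by DOI, with an optional JSON stats snapshot
    total, xlsx_out, report = export_files_with_report(
        inputs=[Path("exports/pubmed.xml"), Path("exports/scopus.xml")],
        out_path=Path("out/references.xlsx"),
        out_format="xlsx",          # requires openpyxl
        dedupe="doi",
        dedupe_keep="first",
        stats=True,
        stats_json=Path("out/references_stats.json"),
        report_path=Path("out/references_report.txt"),
    )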
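
Likewise, a small sketch of how the deduplication keys behave (the example rows are made up):

    from endnote_utils.core import dedupe_key, normalize_text_for_key

    row = {"title": "Deep Learning, a Survey!", "year": "2020", "doi": "https://doi.org/10.1000/xyz"}
    print(normalize_text_for_key(row["title"]))             # "deep learning a survey"
    print(dedupe_key(row, "title-year"))                    # "deep learning a survey::2020"
    print(dedupe_key(row, "doi"))                           # the DOI string itself
    print(dedupe_key({"title": "No year"}, "title-year"))   # None -> row passes through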