endnote-utils 0.1.3-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- endnote_utils/cli.py +186 -0
- endnote_utils/core.py +673 -0
- endnote_utils-0.2.0.dist-info/METADATA +223 -0
- endnote_utils-0.2.0.dist-info/RECORD +8 -0
- endnote_utils-0.2.0.dist-info/top_level.txt +1 -0
- endnote-utils/cli.py +0 -54
- endnote-utils/core.py +0 -209
- endnote_utils-0.1.3.dist-info/METADATA +0 -145
- endnote_utils-0.1.3.dist-info/RECORD +0 -8
- endnote_utils-0.1.3.dist-info/top_level.txt +0 -1
- {endnote-utils → endnote_utils}/__init__.py +0 -0
- {endnote_utils-0.1.3.dist-info → endnote_utils-0.2.0.dist-info}/WHEEL +0 -0
- {endnote_utils-0.1.3.dist-info → endnote_utils-0.2.0.dist-info}/entry_points.txt +0 -0
endnote_utils/core.py
ADDED
@@ -0,0 +1,673 @@
# src/endnote_utils/core.py
from __future__ import annotations

import csv
import json
import logging
import time
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

# ----------------------------
# Public constants
# ----------------------------

DEFAULT_FIELDNAMES: List[str] = [
    "database",
    "ref_type",
    "title",
    "journal",
    "authors",
    "year",
    "volume",
    "number",
    "abstract",
    "doi",
    "urls",
    "keywords",
    "publisher",
    "isbn",
    "language",
    "extracted_date",
]

CSV_QUOTING_MAP = {
    "minimal": csv.QUOTE_MINIMAL,
    "all": csv.QUOTE_ALL,
    "nonnumeric": csv.QUOTE_NONNUMERIC,
    "none": csv.QUOTE_NONE,
}

# Report layout
DUPES_DETAILS_LIMIT = 50
STATS_LIST_LIMIT = 20


# ----------------------------
# FS helpers
# ----------------------------

def ensure_parent_dir(p: Path) -> None:
    """Create parent directory if it doesn't exist."""
    p.parent.mkdir(parents=True, exist_ok=True)


# ----------------------------
# Text helpers
# ----------------------------

def clean_text(text: Optional[str]) -> str:
    """
    Trim, collapse internal whitespace, remove stray CRs, keep punctuation intact.
    """
    if not text:
        return ""
    text = text.replace("\r", " ")
    return " ".join(text.split()).strip()


def safe_find_text(node: ET.Element, path: str) -> str:
    """Find text with XPath and return cleaned string."""
    elem = node.find(path)
    return clean_text(elem.text) if elem is not None and elem.text is not None else ""


def join_nonempty(items: Iterable[str], sep: str) -> str:
    return sep.join(x for x in (i.strip() for i in items) if x)


def normalize_text_for_key(s: str) -> str:
    """Lowercase + strip non-alnum + single-space. Good for stable keys."""
    if not s:
        return ""
    s = s.lower()
    s = "".join(ch for ch in s if ch.isalnum() or ch.isspace())
    return " ".join(s.split())

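# Illustrative sketch (not part of the released file): how the text helpers above
# behave on messy input; the sample strings are invented for the example.
#
#     >>> clean_text("Deep  learning\r\n in  medicine ")
#     'Deep learning in medicine'
#     >>> normalize_text_for_key("Deep Learning, in Medicine!")
#     'deep learning in medicine'
#     >>> join_nonempty(["Smith, J.", "", "  ", "Doe, A."], "; ")
#     'Smith, J.; Doe, A.'
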
# ----------------------------
# Record extraction
# ----------------------------

def process_doi(record: ET.Element) -> str:
    """Extract and format DOI information to a canonical URL if possible."""
    doi_raw = safe_find_text(record, ".//electronic-resource-num/style")
    if not doi_raw:
        return ""
    if doi_raw.startswith("10."):
        return f"https://doi.org/{doi_raw}"
    if doi_raw.startswith(("http://", "https://")):
        return doi_raw
    return ""

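# Illustrative sketch (not part of the released file): DOI canonicalisation on a
# minimal in-memory record; the XML snippet and DOI value are hypothetical.
#
#     >>> rec = ET.fromstring(
#     ...     "<record><electronic-resource-num><style>10.1000/xyz123"
#     ...     "</style></electronic-resource-num></record>")
#     >>> process_doi(rec)
#     'https://doi.org/10.1000/xyz123'
#     >>> process_doi(ET.fromstring("<record/>"))
#     ''
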
def extract_authors(record: ET.Element) -> str:
    """Collect authors from //author/style, joined by '; '."""
    authors: List[str] = []
    for author in record.findall(".//author"):
        style = author.find("style")
        if style is not None and style.text:
            authors.append(clean_text(style.text))
    return join_nonempty(authors, "; ")


def extract_urls(record: ET.Element) -> str:
    """Collect related URLs from //urls/related-urls/url/style, joined by ' | '."""
    urls: List[str] = []
    for url in record.findall(".//urls/related-urls/url"):
        style = url.find("style")
        if style is not None and style.text:
            urls.append(clean_text(style.text))
    # Deduplicate while preserving order
    seen = set()
    deduped = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            deduped.append(u)
    return join_nonempty(deduped, " | ")


def extract_keywords(record: ET.Element) -> str:
    """Collect keywords from //keywords/keyword/style, joined by '; ' (deduped)."""
    items: List[str] = []
    for kw in record.findall(".//keywords/keyword"):
        style = kw.find("style")
        if style is not None and style.text:
            items.append(clean_text(style.text))
    seen = set()
    out: List[str] = []
    for x in items:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return join_nonempty(out, "; ")


def process_record(record: ET.Element, database: str) -> Dict[str, str]:
    """Transform a <record> element into a flat dictionary."""
    ref_type_name = ""
    ref_type = record.find("ref-type")
    if ref_type is not None:
        ref_type_name = ref_type.get("name") or ""

    return {
        "database": database,
        "ref_type": clean_text(ref_type_name),
        "title": safe_find_text(record, ".//title/style"),
        "journal": safe_find_text(record, ".//secondary-title/style"),
        "authors": extract_authors(record),
        "year": safe_find_text(record, ".//year/style"),
        "volume": safe_find_text(record, ".//volume/style"),
        "number": safe_find_text(record, ".//number/style"),
        "abstract": safe_find_text(record, ".//abstract/style"),
        "doi": process_doi(record),
        "urls": extract_urls(record),
        "keywords": extract_keywords(record),
        "publisher": safe_find_text(record, ".//publisher/style"),
        "isbn": safe_find_text(record, ".//isbn/style"),
        "language": safe_find_text(record, ".//language/style"),
        "extracted_date": datetime.now().strftime("%Y-%m-%d"),
    }

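# Illustrative sketch (not part of the released file): flattening a minimal
# in-memory <record>; real EndNote exports carry many more fields and the XML
# below is hypothetical.
#
#     >>> xml = ("<record><ref-type name='Journal Article'/>"
#     ...        "<titles><title><style>A study</style></title></titles>"
#     ...        "<contributors><authors><author><style>Doe, A.</style></author>"
#     ...        "</authors></contributors></record>")
#     >>> row = process_record(ET.fromstring(xml), database="pubmed")
#     >>> row["ref_type"], row["title"], row["authors"]
#     ('Journal Article', 'A study', 'Doe, A.')
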
# ----------------------------
# XML streaming + filters
# ----------------------------

def iter_records(xml_path: Path) -> Iterable[ET.Element]:
    """Stream <record> elements with low memory footprint."""
    context = ET.iterparse(str(xml_path), events=("start", "end"))
    _, root = next(context)
    for event, elem in context:
        if event == "end" and elem.tag == "record":
            yield elem
            elem.clear()
            root.clear()


def record_matches_filters(row: Dict[str, str], ref_type: Optional[str], year: Optional[str]) -> bool:
    if ref_type and row.get("ref_type") != ref_type:
        return False
    if year and row.get("year") != str(year):
        return False
    return True

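# Illustrative sketch (not part of the released file): streaming one export and
# keeping only 2021 journal articles; 'library.xml' is a hypothetical path.
#
#     >>> kept = []
#     >>> for rec in iter_records(Path("library.xml")):
#     ...     row = process_record(rec, database="library")
#     ...     if record_matches_filters(row, ref_type="Journal Article", year="2021"):
#     ...         kept.append(row)
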
# ----------------------------
# Deduplication helpers
# ----------------------------

def dedupe_key(row: Dict[str, str], mode: str) -> Optional[str]:
    """
    mode: 'none' | 'doi' | 'title-year'
    Returns None when no applicable key can be formed (row passes through).
    """
    if mode == "doi":
        k = (row.get("doi") or "").strip()
        return k or None
    if mode == "title-year":
        title = normalize_text_for_key(row.get("title", ""))
        year = (row.get("year") or "").strip()
        if title and year:
            return f"{title}::{year}"
        return None
    return None

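# Illustrative sketch (not part of the released file): the two key shapes
# produced above, on hand-built rows.
#
#     >>> dedupe_key({"doi": "https://doi.org/10.1000/xyz123"}, "doi")
#     'https://doi.org/10.1000/xyz123'
#     >>> dedupe_key({"title": "A Study of Things", "year": "2021"}, "title-year")
#     'a study of things::2021'
#     >>> dedupe_key({"title": "A Study of Things"}, "title-year") is None
#     True
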
# ----------------------------
# Retraction detection (basic heuristic)
# ----------------------------

def is_retraction(record: ET.Element) -> bool:
    """
    Heuristic: consider a record 'retraction' if its notes or title contain substrings like:
    'retraction', 'retracted', 'withdrawn', 'erratum'.
    """
    text_blob = " ".join(
        [
            safe_find_text(record, ".//notes/style"),
            safe_find_text(record, ".//title/style"),
        ]
    ).lower()
    indicators = ("retraction", "retracted", "withdrawn", "erratum")
    return any(tok in text_blob for tok in indicators)

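# Illustrative sketch (not part of the released file): the heuristic fires on
# title or notes text alone; the record below is invented.
#
#     >>> rec = ET.fromstring("<record><titles><title><style>"
#     ...                     "Retracted: A study</style></title></titles></record>")
#     >>> is_retraction(rec)
#     True
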
# ----------------------------
# Writers (CSV / JSON / XLSX)
# ----------------------------

def _write_rows_csv(
    rows_iter: Iterable[Dict[str, str]],
    out_path: Path,
    fieldnames: List[str],
    delimiter: str,
    quoting: str,
    include_header: bool,
    encoding: str,
) -> int:
    qmode = CSV_QUOTING_MAP[quoting.lower()]
    ensure_parent_dir(out_path)
    count = 0
    with open(out_path, "w", newline="", encoding=encoding) as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
        if include_header:
            writer.writeheader()
        for row in rows_iter:
            writer.writerow({k: row.get(k, "") for k in fieldnames})
            count += 1
    return count


def _write_rows_json(
    rows_iter: Iterable[Dict[str, str]],
    out_path: Path,
    fieldnames: List[str],
    encoding: str,
) -> int:
    """Write a JSON array streaming without holding all rows in memory."""
    ensure_parent_dir(out_path)
    count = 0
    with open(out_path, "w", encoding=encoding) as f:
        f.write("[")
        first = True
        for row in rows_iter:
            obj = {k: row.get(k, "") for k in fieldnames}
            if first:
                first = False
            else:
                f.write(",")
            f.write(json.dumps(obj, ensure_ascii=False))
            count += 1
        f.write("]")
    return count


def _write_rows_xlsx(
    rows_iter: Iterable[Dict[str, str]],
    out_path: Path,
    fieldnames: List[str],
) -> int:
    """Write an Excel file using openpyxl (installed via project dependencies)."""
    try:
        from openpyxl import Workbook
    except ImportError as e:
        raise RuntimeError(
            "Excel output requires 'openpyxl'. Ensure it is installed."
        ) from e

    ensure_parent_dir(out_path)
    wb = Workbook()
    ws = wb.active
    ws.title = "records"
    ws.append(fieldnames)  # header

    count = 0
    for row in rows_iter:
        ws.append([row.get(k, "") for k in fieldnames])
        count += 1

    wb.save(out_path)
    return count

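# Illustrative sketch (not part of the released file): the writers consume any
# iterable of row dicts, so they pair naturally with a generator. These helpers
# are module-private; the output path below is hypothetical.
#
#     >>> demo_rows = ({"title": f"Paper {i}", "year": "2024"} for i in range(3))
#     >>> _write_rows_csv(demo_rows, Path("out/demo.csv"), ["title", "year"],
#     ...                 delimiter=",", quoting="minimal", include_header=True,
#     ...                 encoding="utf-8")
#     3
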
# ----------------------------
# Generic export + report (+ dedupe + stats, pretty report + duplicates table)
# ----------------------------

def export_files_with_report(
    inputs: List[Path],
    out_path: Path,
    out_format: str,  # "csv" | "json" | "xlsx"
    *,
    fieldnames: List[str] = None,
    delimiter: str = ",",
    quoting: str = "minimal",
    include_header: bool = True,
    encoding: str = "utf-8",
    ref_type: Optional[str] = None,
    year: Optional[str] = None,
    max_records_per_file: Optional[int] = None,
    report_path: Optional[Path] = None,
    # Dedup + stats
    dedupe: str = "none",
    dedupe_keep: str = "first",
    stats: bool = False,
    stats_json: Optional[Path] = None,
    top_authors: int = 10,
) -> Tuple[int, Path, Optional[Path]]:
    """
    Stream records from one or many EndNote XML files and write to CSV/JSON/XLSX.
    Writes a pretty TXT report unless report_path is None.

    Deduplication:
      - dedupe='doi' → unique by DOI
      - dedupe='title-year' → unique by normalized (title, year)
      - dedupe_keep='first' or 'last' (applies within each input file)

    Stats (when stats=True) add counts by year/ref_type/journal and top authors.
    stats_json (if provided) writes a JSON snapshot of these stats + duplicates.

    The report now includes a per-database table: Origin / Retractions / Duplicates / Remaining.

    Returns (total_rows_written, out_path, report_path or None if disabled).
    """
    fieldnames = fieldnames or DEFAULT_FIELDNAMES
    out_format = out_format.lower()
    if out_format not in {"csv", "json", "xlsx"}:
        raise ValueError(f"Unknown out_format: {out_format}")

    # Per-run accumulators
    per_file_lines: List[str] = []

    year_counter = Counter()
    type_counter = Counter()
    journal_counter = Counter()
    author_counter = Counter()

    # Dedupe state
    seen_keys: set[str] = set()
    duplicates_counter = Counter()  # global key -> duplicate count
    dupes_removed_per_db: Dict[str, int] = {}

    # Per-database accounting for the table
    # Origin: records after filters, before dedupe (and before max_records per file cap)
    # Retractions: simple heuristic via is_retraction()
    # Duplicates: removed due to dedupe
    # Remaining: exported (post-dedupe)
    per_db: Dict[str, Dict[str, int]] = {}

    def rows() -> Iterable[Dict[str, str]]:
        nonlocal per_file_lines, seen_keys, duplicates_counter, per_db
        nonlocal year_counter, type_counter, journal_counter, author_counter, dupes_removed_per_db

        for xml_path in inputs:
            database = xml_path.stem
            per_db.setdefault(database, {"origin": 0, "retractions": 0, "duplicates": 0, "remaining": 0})
            dupes_removed_per_db.setdefault(database, 0)

            logging.info("Processing %s (database=%s)", xml_path.name, database)

            produced = 0
            skipped = 0

            buffered: List[Dict[str, str]] = []
            buffered_keys_index: Dict[str, int] = {}

            for rec in iter_records(xml_path):
                try:
                    # Build row (for filters & output)
                    row = process_record(rec, database=database)

                    # Filter
                    if not record_matches_filters(row, ref_type, year):
                        continue

                    # Origin++ (count any passing-filter record before dedupe)
                    per_db[database]["origin"] += 1

                    # Retraction heuristic
                    if is_retraction(rec):
                        per_db[database]["retractions"] += 1

                    # Dedup
                    k = dedupe_key(row, dedupe)
                    if k and dedupe != "none":
                        if dedupe_keep == "first":
                            if k in seen_keys:
                                duplicates_counter[k] += 1
                                per_db[database]["duplicates"] += 1
                                dupes_removed_per_db[database] += 1
                                continue
                            seen_keys.add(k)
                            buffered.append(row)
                            produced += 1
                        else:  # keep last within this file
                            if k in buffered_keys_index:
                                # replace old occurrence in this file buffer
                                prev_idx = buffered_keys_index[k]
                                buffered[prev_idx] = row
                                duplicates_counter[k] += 1
                                per_db[database]["duplicates"] += 1
                                dupes_removed_per_db[database] += 1
                            else:
                                buffered_keys_index[k] = len(buffered)
                                buffered.append(row)
                                produced += 1
                                seen_keys.add(k)
                    else:
                        buffered.append(row)
                        produced += 1

                    if max_records_per_file and produced >= max_records_per_file:
                        break

                except Exception:
                    skipped += 1
                    logging.debug("Record error in %s", xml_path, exc_info=True)

            # Remaining = exported from this file
            per_db[database]["remaining"] += len(buffered)

            per_file_lines.append(f"{xml_path.name:<15} : {len(buffered)} exported, {skipped} skipped")

            # Stats
            if stats:
                for r in buffered:
                    y = (r.get("year") or "").strip()
                    t = (r.get("ref_type") or "").strip()
                    j = (r.get("journal") or "").strip()
                    if y:
                        year_counter[y] += 1
                    if t:
                        type_counter[t] += 1
                    if j:
                        journal_counter[j] += 1
                    if r.get("authors"):
                        for a in (x.strip() for x in r["authors"].split(";")):
                            if a:
                                author_counter[a] += 1

            # Yield to writer
            for r in buffered:
                yield r

    # Select writer
    start_ts = time.time()
    run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if out_format == "csv":
        total = _write_rows_csv(rows(), out_path, fieldnames, delimiter, quoting, include_header, encoding)
    elif out_format == "json":
        total = _write_rows_json(rows(), out_path, fieldnames, encoding)
    else:  # xlsx
        total = _write_rows_xlsx(rows(), out_path, fieldnames)

    duration = time.time() - start_ts

    # ---------- Pretty report builder ----------
    def _header_line(title: str) -> List[str]:
        bar = "=" * 40
        return [bar, title, bar]

    def _section_line(title: str) -> List[str]:
        return ["", title, "-" * 40]

    report_lines: List[str] = []
    report_lines += _header_line("EndNote Export Report")
    report_lines += [
        f"Run started : {run_start}",
        f"Files : {len(inputs)}",
        f"Duration : {duration:.2f} seconds",
    ]

    # Per-file section
    report_lines += _section_line("Per-file results")
    report_lines += per_file_lines
    report_lines.append(f"TOTAL exported: {total}")

    # Per-database duplicates table
    # Build totals row
    if per_db:
        report_lines += _section_line("Duplicates table (by database)")
        # compute column widths
        db_names = list(per_db.keys())
        db_col_w = max([len("Database")] + [len(db) for db in db_names])

        # totals
        tot_origin = sum(d["origin"] for d in per_db.values())
        tot_retract = sum(d["retractions"] for d in per_db.values())
        tot_dupes = sum(d["duplicates"] for d in per_db.values())
        tot_remain = sum(d["remaining"] for d in per_db.values())

        header = f"{'Database':<{db_col_w}} {'Origin':>8} {'Retractions':>12} {'Duplicates':>10} {'Remaining':>10}"
        report_lines.append(header)
        report_lines.append("-" * len(header))

        for db in sorted(per_db.keys()):
            d = per_db[db]
            line = (
                f"{db:<{db_col_w}} "
                f"{d['origin']:>8} "
                f"{d['retractions']:>12} "
                f"{d['duplicates']:>10} "
                f"{d['remaining']:>10}"
            )
            report_lines.append(line)

        total_line = (
            f"{'TOTAL':<{db_col_w}} "
            f"{tot_origin:>8} "
            f"{tot_retract:>12} "
            f"{tot_dupes:>10} "
            f"{tot_remain:>10}"
        )
        report_lines.append(total_line)

    # Duplicates key summary (top)
    if dedupe != "none":
        report_lines += _section_line("Duplicate keys (top)")
        total_dupes_global = sum(duplicates_counter.values())
        report_lines.append(f"Mode : {dedupe}")
        report_lines.append(f"Keep : {dedupe_keep}")
        report_lines.append(f"Removed: {total_dupes_global}")
        if total_dupes_global > 0:
            report_lines.append("Details (top):")
            for k, c in duplicates_counter.most_common(DUPES_DETAILS_LIMIT):
                report_lines.append(f" {k} : {c} duplicate(s)")

    # Summary stats
    if stats:
        def head(counter: Counter, n: int = 10):
            return [(k, c) for k, c in counter.most_common(n) if k]

        report_lines += _section_line("Summary stats")
        # Year
        report_lines.append("By year:")
        for y in sorted(year_counter.keys()):
            report_lines.append(f" {y:>6} : {year_counter[y]}")
        # Ref type
        report_lines.append("")
        report_lines.append("By ref_type (top):")
        for k, c in head(type_counter, STATS_LIST_LIMIT):
            report_lines.append(f" {k}: {c}")
        # Journal
        report_lines.append("")
        report_lines.append(f"By journal (top {STATS_LIST_LIMIT}):")
        for k, c in head(journal_counter, STATS_LIST_LIMIT):
            report_lines.append(f" {k}: {c}")
        # Authors
        report_lines.append("")
        report_lines.append(f"Top authors (top {top_authors}):")
        for k, c in head(author_counter, top_authors):
            report_lines.append(f" {k}: {c}")

    # Optional JSON dump
    if stats_json:
        ensure_parent_dir(stats_json)
        with open(stats_json, "w", encoding="utf-8") as jf:
            json.dump(
                {
                    "totals": {
                        "exported": total,
                        "files_processed": len(inputs),
                        "duration_seconds": duration,
                    },
                    "by_year": dict(year_counter),
                    "by_ref_type": dict(type_counter),
                    "by_journal": dict(journal_counter),
                    "top_authors": author_counter.most_common(top_authors),
                    "duplicates": {
                        "mode": dedupe,
                        "keep": dedupe_keep,
                        "removed": sum(duplicates_counter.values()) if dedupe != "none" else 0,
                        "top": duplicates_counter.most_common(DUPES_DETAILS_LIMIT) if dedupe != "none" else [],
                        "by_database": per_db,
                    },
                },
                jf,
                ensure_ascii=False,
                indent=2,
            )

    # Write report unless disabled
    final_report_path: Optional[Path] = report_path
    if final_report_path is not None:
        final_report_path = final_report_path or out_path.with_name(out_path.stem + "_report.txt")
        ensure_parent_dir(final_report_path)
        with open(final_report_path, "w", encoding="utf-8") as rf:
            rf.write("\n".join(report_lines))

    return total, out_path, final_report_path

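# Illustrative sketch (not part of the released file): a typical call combining
# DOI-based deduplication with stats; all file names below are hypothetical.
#
#     >>> total, out, report = export_files_with_report(
#     ...     inputs=[Path("pubmed.xml"), Path("scopus.xml")],
#     ...     out_path=Path("out/records.xlsx"),
#     ...     out_format="xlsx",
#     ...     dedupe="doi",
#     ...     dedupe_keep="first",
#     ...     stats=True,
#     ...     stats_json=Path("out/stats.json"),
#     ...     report_path=Path("out/report.txt"),
#     ... )
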
# ----------------------------
# Back-compat convenience wrappers (CSV only)
# ----------------------------

def export_files_to_csv_with_report(
    inputs: List[Path],
    csv_path: Path,
    report_path: Optional[Path] = None,
    *,
    fieldnames: List[str] = None,
    delimiter: str = ",",
    quoting: str = "minimal",
    include_header: bool = True,
    encoding: str = "utf-8",
    ref_type: Optional[str] = None,
    year: Optional[str] = None,
    max_records_per_file: Optional[int] = None,
) -> Tuple[int, Path, Optional[Path]]:
    """Legacy API: export to CSV + TXT report (or no report if report_path=None)."""
    return export_files_with_report(
        inputs=inputs,
        out_path=csv_path,
        out_format="csv",
        fieldnames=fieldnames,
        delimiter=delimiter,
        quoting=quoting,
        include_header=include_header,
        encoding=encoding,
        ref_type=ref_type,
        year=year,
        max_records_per_file=max_records_per_file,
        report_path=report_path,
    )


def export(xml_file: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
    """Convenience: single XML file to CSV (+report unless disabled)."""
    return export_files_to_csv_with_report([xml_file], csv_path, **kwargs)


def export_folder(folder: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
    """Convenience: all *.xml in folder to CSV (+report unless disabled)."""
    inputs = sorted(p for p in Path(folder).glob("*.xml") if p.is_file())
    if not inputs:
        raise FileNotFoundError(f"No *.xml found in {folder}")
    return export_files_to_csv_with_report(inputs, csv_path, **kwargs)
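# Illustrative sketch (not part of the released file): the legacy CSV helpers;
# folder and file names are hypothetical.
#
#     >>> total, csv_path, report = export(Path("library.xml"), Path("out/library.csv"), year="2020")
#     >>> total, csv_path, report = export_folder(Path("exports"), Path("out/all.csv"),
#     ...                                         report_path=Path("out/all_report.txt"))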