endnote-utils 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- endnote_utils/cli.py +164 -32
- endnote_utils/core.py +518 -54
- endnote_utils-0.2.0.dist-info/METADATA +223 -0
- endnote_utils-0.2.0.dist-info/RECORD +8 -0
- endnote_utils-0.1.4.dist-info/METADATA +0 -145
- endnote_utils-0.1.4.dist-info/RECORD +0 -8
- {endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/WHEEL +0 -0
- {endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/entry_points.txt +0 -0
- {endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/top_level.txt +0 -0
endnote_utils/cli.py
CHANGED
@@ -4,51 +4,183 @@ import argparse
 import logging
 import sys
 from pathlib import Path
+from typing import List, Optional, Tuple

-from .core import
+from .core import (
+    DEFAULT_FIELDNAMES,
+    export_files_with_report,  # generic writer: csv/json/xlsx
+)
+
+SUPPORTED_FORMATS = ("csv", "json", "xlsx")
+EXT_TO_FORMAT = {".csv": "csv", ".json": "json", ".xlsx": "xlsx"}


 def build_parser() -> argparse.ArgumentParser:
-    p = argparse.ArgumentParser(
+    p = argparse.ArgumentParser(
+        description="Export EndNote XML (file or folder) to CSV/JSON/XLSX with a TXT report."
+    )
+
+    # Input source (mutually exclusive)
     g = p.add_mutually_exclusive_group(required=True)
     g.add_argument("--xml", help="Path to a single EndNote XML file.")
     g.add_argument("--folder", help="Path to a folder containing *.xml files.")
-
-
-    p.add_argument(
-
-
-
-
-    p.add_argument(
-
-
+
+    # Output selection (CSV legacy flag + new generic flags)
+    p.add_argument(
+        "--csv",
+        required=False,
+        help="(Legacy) Output CSV path. Prefer --out for csv/json/xlsx.",
+    )
+    p.add_argument(
+        "--out",
+        required=False,
+        help="Generic output path; format inferred from file extension if --format not provided. "
+             "Supported extensions: .csv, .json, .xlsx",
+    )
+    p.add_argument(
+        "--format",
+        choices=SUPPORTED_FORMATS,
+        help="Output format. If omitted, inferred from --out extension or --csv.",
+    )
+
+    # Report controls
+    p.add_argument("--report", required=False, help="Path to TXT report (default: <output>_report.txt).")
+    p.add_argument(
+        "--no-report",
+        action="store_true",
+        help="Disable writing the TXT report (by default, a report is always generated).",
+    )
+
+    # CSV-specific formatting options (ignored for JSON/XLSX except delimiter/quoting/header)
+    p.add_argument("--delimiter", default=",", help="CSV delimiter (default: ',').")
+    p.add_argument(
+        "--quoting",
+        default="minimal",
+        choices=["minimal", "all", "nonnumeric", "none"],
+        help="CSV quoting mode (default: minimal).",
+    )
+    p.add_argument("--no-header", action="store_true", help="Do not write CSV header row.")
+    p.add_argument("--encoding", default="utf-8", help="Output text encoding (default: utf-8).")
+
+    # Filters / limits
+    p.add_argument("--ref-type", default=None, help="Filter by ref_type name.")
+    p.add_argument("--year", default=None, help="Filter by year.")
+    p.add_argument("--max-records", type=int, default=None, help="Max records per file (testing).")
+
+    # Deduplication & Stats
+    p.add_argument("--dedupe", choices=["none", "doi", "title-year"], default="none",
+                   help="Deduplicate records by key. Default: none.")
+    p.add_argument("--dedupe-keep", choices=["first", "last"], default="first",
+                   help="When duplicates found, keep the first or last occurrence. Default: first.")
+    p.add_argument("--stats", action="store_true",
+                   help="Compute summary stats and include them in the TXT report.")
+    p.add_argument("--stats-json",
+                   help="Optional JSON file path to write detailed stats (when --stats is used).")
+    p.add_argument("--top-authors", type=int, default=10,
+                   help="How many top authors to list in the report/stats JSON. Default: 10.")
+
+    # Verbosity
+    p.add_argument("--verbose", action="store_true", help="Verbose logging.")
+
     return p

+
+def _resolve_inputs(args: argparse.Namespace) -> List[Path]:
+    if args.xml:
+        xml_path = Path(args.xml)
+        if not xml_path.is_file():
+            raise FileNotFoundError(xml_path)
+        return [xml_path]
+
+    folder = Path(args.folder)
+    if not folder.is_dir():
+        raise FileNotFoundError(folder)
+    inputs = sorted(p for p in folder.glob("*.xml") if p.is_file())
+    if not inputs:
+        raise FileNotFoundError(f"No *.xml files found in folder: {folder}")
+    return inputs
+
+
+def _resolve_output_and_format(args: argparse.Namespace) -> tuple[Path, str, Optional[Path]]:
+    """
+    Decide final out_path, out_format, and report_path using:
+      - Prefer --out/--format if provided
+      - Fallback to --csv (legacy) which implies CSV
+      - If --no-report, return report_path=None
+    """
+    target_path: Optional[Path] = None
+    out_format: Optional[str] = None
+
+    if args.out:
+        target_path = Path(args.out)
+        out_format = args.format
+        if not out_format:
+            # infer from extension
+            out_format = EXT_TO_FORMAT.get(target_path.suffix.lower())
+            if not out_format:
+                raise SystemExit(
+                    "Cannot infer output format from extension. "
+                    "Use --format {csv,json,xlsx} or set a supported extension."
+                )
+    elif args.csv:
+        target_path = Path(args.csv)
+        out_format = args.format or "csv"
+        if out_format != "csv":
+            # user asked for non-csv but used --csv path
+            raise SystemExit("When using --csv, --format must be 'csv'. Use --out for json/xlsx.")
+    else:
+        raise SystemExit("You must provide either --out (preferred) or --csv (legacy).")
+
+    # Report path defaults next to chosen output file (unless disabled)
+    if args.no_report:
+        report_path: Optional[Path] = None
+    else:
+        report_path = Path(args.report) if args.report else target_path.with_name(target_path.stem + "_report.txt")
+
+    return target_path, out_format, report_path
+
+
 def main() -> None:
     args = build_parser().parse_args()
     logging.basicConfig(
         level=logging.DEBUG if args.verbose else logging.INFO,
-        format="%(levelname)s: %(message)s",
-
-    csv_path = Path(args.csv)
-    report_path = Path(args.report) if args.report else csv_path.with_name(csv_path.stem + "_report.txt")
-    kwargs = dict(
-        report_path=report_path,
-        fieldnames=DEFAULT_FIELDNAMES,
-        delimiter=args.delimiter,
-        quoting=args.quoting,
-        include_header=not args.no_header,
-        encoding=args.encoding,
-        ref_type=args.ref_type,
-        year=args.year,
-        max_records_per_file=args.max_records,
+        format="%(levelname)s: %(message)s",
+        stream=sys.stderr,
     )

-
-
-
-
+    try:
+        inputs = _resolve_inputs(args)
+        out_path, out_format, report_path = _resolve_output_and_format(args)
+
+        total, final_out, final_report = export_files_with_report(
+            inputs=inputs,
+            out_path=out_path,
+            out_format=out_format,
+            fieldnames=DEFAULT_FIELDNAMES,
+            delimiter=args.delimiter,
+            quoting=args.quoting,
+            include_header=not args.no_header,
+            encoding=args.encoding,
+            ref_type=args.ref_type,
+            year=args.year,
+            max_records_per_file=args.max_records,
+            dedupe=args.dedupe,
+            dedupe_keep=args.dedupe_keep,
+            stats=args.stats,
+            stats_json=Path(args.stats_json) if args.stats_json else None,
+            top_authors=args.top_authors,
+            report_path=report_path,  # may be None → core should skip writing report
+        )
+
+        logging.info("Exported %d record(s) → %s", total, final_out)
+        if report_path is None:
+            logging.info("Report disabled by --no-report.")
+        else:
+            logging.info("Report → %s", final_report)

-
-
+    except FileNotFoundError as e:
+        logging.error("File/folder not found: %s", e)
+        sys.exit(1)
+    except Exception as e:
+        logging.error("Unexpected error: %s", e)
+        sys.exit(2)
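The rewritten `main()` above delegates input discovery and output selection to the two new helpers. A minimal sketch of how the `--out`/`--format` resolution behaves (the paths are made-up examples; nothing is written, since `main()` is never called):

```python
from endnote_utils.cli import build_parser, _resolve_output_and_format

parser = build_parser()

# Format is inferred from the .xlsx extension of --out; the report defaults next to the output.
args = parser.parse_args(["--xml", "data/IEEE.xml", "--out", "output/ieee.xlsx"])
out_path, out_format, report_path = _resolve_output_and_format(args)
print(out_format)   # xlsx
print(report_path)  # output/ieee_report.txt

# The legacy --csv flag still works, but only for CSV; --no-report suppresses the report path.
args = parser.parse_args(["--xml", "data/IEEE.xml", "--csv", "output/ieee.csv", "--no-report"])
print(_resolve_output_and_format(args))  # (PosixPath('output/ieee.csv'), 'csv', None)
```

Anything else, for example `--out results.txt` with no `--format`, exits with the `SystemExit` message shown in `_resolve_output_and_format`.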
endnote_utils/core.py
CHANGED
@@ -1,17 +1,37 @@
-# src/
+# src/endnote_utils/core.py
 from __future__ import annotations

 import csv
+import json
 import logging
 import time
 import xml.etree.ElementTree as ET
+from collections import Counter
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Tuple

+# ----------------------------
+# Public constants
+# ----------------------------
+
 DEFAULT_FIELDNAMES: List[str] = [
-    "database",
-    "
+    "database",
+    "ref_type",
+    "title",
+    "journal",
+    "authors",
+    "year",
+    "volume",
+    "number",
+    "abstract",
+    "doi",
+    "urls",
+    "keywords",
+    "publisher",
+    "isbn",
+    "language",
+    "extracted_date",
 ]

 CSV_QUOTING_MAP = {
@@ -21,17 +41,27 @@ CSV_QUOTING_MAP = {
     "none": csv.QUOTE_NONE,
 }

+# Report layout
+DUPES_DETAILS_LIMIT = 50
+STATS_LIST_LIMIT = 20
+
+
+# ----------------------------
+# FS helpers
+# ----------------------------
+
 def ensure_parent_dir(p: Path) -> None:
+    """Create parent directory if it doesn't exist."""
     p.parent.mkdir(parents=True, exist_ok=True)

+
 # ----------------------------
-#
+# Text helpers
 # ----------------------------

 def clean_text(text: Optional[str]) -> str:
     """
     Trim, collapse internal whitespace, remove stray CRs, keep punctuation intact.
-    Safer for CSV fields than aggressive normalization.
     """
     if not text:
         return ""
@@ -48,14 +78,18 @@ def safe_find_text(node: ET.Element, path: str) -> str:
 def join_nonempty(items: Iterable[str], sep: str) -> str:
     return sep.join(x for x in (i.strip() for i in items) if x)

-
-
-
-
+
+def normalize_text_for_key(s: str) -> str:
+    """Lowercase + strip non-alnum + single-space. Good for stable keys."""
+    if not s:
+        return ""
+    s = s.lower()
+    s = "".join(ch for ch in s if ch.isalnum() or ch.isspace())
+    return " ".join(s.split())


 # ----------------------------
-# Record
+# Record extraction
 # ----------------------------

 def process_doi(record: ET.Element) -> str:
|
|
97
131
|
return join_nonempty(deduped, " | ")
|
98
132
|
|
99
133
|
|
134
|
+
def extract_keywords(record: ET.Element) -> str:
|
135
|
+
"""Collect keywords from //keywords/keyword/style, joined by '; ' (deduped)."""
|
136
|
+
items: List[str] = []
|
137
|
+
for kw in record.findall(".//keywords/keyword"):
|
138
|
+
style = kw.find("style")
|
139
|
+
if style is not None and style.text:
|
140
|
+
items.append(clean_text(style.text))
|
141
|
+
seen = set()
|
142
|
+
out: List[str] = []
|
143
|
+
for x in items:
|
144
|
+
if x not in seen:
|
145
|
+
seen.add(x)
|
146
|
+
out.append(x)
|
147
|
+
return join_nonempty(out, "; ")
|
148
|
+
|
149
|
+
|
100
150
|
def process_record(record: ET.Element, database: str) -> Dict[str, str]:
|
101
|
-
"""Transform a <record> element into a dictionary
|
151
|
+
"""Transform a <record> element into a flat dictionary."""
|
102
152
|
ref_type_name = ""
|
103
153
|
ref_type = record.find("ref-type")
|
104
154
|
if ref_type is not None:
|
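The new `extract_keywords()` walks `keywords/keyword/style` nodes, cleans each value, and drops repeats while keeping order. A small sketch on a hand-written fragment (not a real EndNote export):

```python
import xml.etree.ElementTree as ET

from endnote_utils.core import extract_keywords

record = ET.fromstring(
    "<record><keywords>"
    "<keyword><style>deep learning</style></keyword>"
    "<keyword><style>ECG</style></keyword>"
    "<keyword><style>deep learning</style></keyword>"  # repeated keyword, dropped
    "</keywords></record>"
)
print(extract_keywords(record))  # deep learning; ECG
```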
@@ -116,10 +166,20 @@ def process_record(record: ET.Element, database: str) -> Dict[str, str]:
         "abstract": safe_find_text(record, ".//abstract/style"),
         "doi": process_doi(record),
         "urls": extract_urls(record),
+        "keywords": extract_keywords(record),
+        "publisher": safe_find_text(record, ".//publisher/style"),
+        "isbn": safe_find_text(record, ".//isbn/style"),
+        "language": safe_find_text(record, ".//language/style"),
         "extracted_date": datetime.now().strftime("%Y-%m-%d"),
     }

+
+# ----------------------------
+# XML streaming + filters
+# ----------------------------
+
 def iter_records(xml_path: Path) -> Iterable[ET.Element]:
+    """Stream <record> elements with low memory footprint."""
     context = ET.iterparse(str(xml_path), events=("start", "end"))
     _, root = next(context)
     for event, elem in context:
@@ -128,15 +188,141 @@ def iter_records(xml_path: Path) -> Iterable[ET.Element]:
             elem.clear()
             root.clear()

+
 def record_matches_filters(row: Dict[str, str], ref_type: Optional[str], year: Optional[str]) -> bool:
-    if ref_type and row.get("ref_type") != ref_type:
-
+    if ref_type and row.get("ref_type") != ref_type:
+        return False
+    if year and row.get("year") != str(year):
+        return False
     return True

-
+
+# ----------------------------
+# Deduplication helpers
+# ----------------------------
+
+def dedupe_key(row: Dict[str, str], mode: str) -> Optional[str]:
+    """
+    mode: 'none' | 'doi' | 'title-year'
+    Returns None when no applicable key can be formed (row passes through).
+    """
+    if mode == "doi":
+        k = (row.get("doi") or "").strip()
+        return k or None
+    if mode == "title-year":
+        title = normalize_text_for_key(row.get("title", ""))
+        year = (row.get("year") or "").strip()
+        if title and year:
+            return f"{title}::{year}"
+        return None
+    return None
+
+
+# ----------------------------
+# Retraction detection (basic heuristic)
+# ----------------------------
+
+def is_retraction(record: ET.Element) -> bool:
+    """
+    Heuristic: consider a record 'retraction' if its notes or title contain substrings like:
+    'retraction', 'retracted', 'withdrawn', 'erratum'.
+    """
+    text_blob = " ".join(
+        [
+            safe_find_text(record, ".//notes/style"),
+            safe_find_text(record, ".//title/style"),
+        ]
+    ).lower()
+    indicators = ("retraction", "retracted", "withdrawn", "erratum")
+    return any(tok in text_blob for tok in indicators)
+
+
+# ----------------------------
+# Writers (CSV / JSON / XLSX)
+# ----------------------------
+
+def _write_rows_csv(
+    rows_iter: Iterable[Dict[str, str]],
+    out_path: Path,
+    fieldnames: List[str],
+    delimiter: str,
+    quoting: str,
+    include_header: bool,
+    encoding: str,
+) -> int:
+    qmode = CSV_QUOTING_MAP[quoting.lower()]
+    ensure_parent_dir(out_path)
+    count = 0
+    with open(out_path, "w", newline="", encoding=encoding) as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
+        if include_header:
+            writer.writeheader()
+        for row in rows_iter:
+            writer.writerow({k: row.get(k, "") for k in fieldnames})
+            count += 1
+    return count
+
+
+def _write_rows_json(
+    rows_iter: Iterable[Dict[str, str]],
+    out_path: Path,
+    fieldnames: List[str],
+    encoding: str,
+) -> int:
+    """Write a JSON array streaming without holding all rows in memory."""
+    ensure_parent_dir(out_path)
+    count = 0
+    with open(out_path, "w", encoding=encoding) as f:
+        f.write("[")
+        first = True
+        for row in rows_iter:
+            obj = {k: row.get(k, "") for k in fieldnames}
+            if first:
+                first = False
+            else:
+                f.write(",")
+            f.write(json.dumps(obj, ensure_ascii=False))
+            count += 1
+        f.write("]")
+    return count
+
+
+def _write_rows_xlsx(
+    rows_iter: Iterable[Dict[str, str]],
+    out_path: Path,
+    fieldnames: List[str],
+) -> int:
+    """Write an Excel file using openpyxl (installed via project dependencies)."""
+    try:
+        from openpyxl import Workbook
+    except ImportError as e:
+        raise RuntimeError(
+            "Excel output requires 'openpyxl'. Ensure it is installed."
+        ) from e
+
+    ensure_parent_dir(out_path)
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "records"
+    ws.append(fieldnames)  # header
+
+    count = 0
+    for row in rows_iter:
+        ws.append([row.get(k, "") for k in fieldnames])
+        count += 1
+
+    wb.save(out_path)
+    return count
+
+
+# ----------------------------
+# Generic export + report (+ dedupe + stats, pretty report + duplicates table)
+# ----------------------------
+
+def export_files_with_report(
     inputs: List[Path],
-
-
+    out_path: Path,
+    out_format: str,  # "csv" | "json" | "xlsx"
     *,
     fieldnames: List[str] = None,
     delimiter: str = ",",
@@ -146,63 +332,341 @@ def export_files_to_csv_with_report(
     ref_type: Optional[str] = None,
     year: Optional[str] = None,
     max_records_per_file: Optional[int] = None,
-
-
-
-
-
+    report_path: Optional[Path] = None,
+    # Dedup + stats
+    dedupe: str = "none",
+    dedupe_keep: str = "first",
+    stats: bool = False,
+    stats_json: Optional[Path] = None,
+    top_authors: int = 10,
+) -> Tuple[int, Path, Optional[Path]]:
+    """
+    Stream records from one or many EndNote XML files and write to CSV/JSON/XLSX.
+    Writes a pretty TXT report unless report_path is None.

-
-
+    Deduplication:
+      - dedupe='doi' → unique by DOI
+      - dedupe='title-year' → unique by normalized (title, year)
+      - dedupe_keep='first' or 'last' (applies within each input file)

-
-
-    run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    Stats (when stats=True) add counts by year/ref_type/journal and top authors.
+    stats_json (if provided) writes a JSON snapshot of these stats + duplicates.

-
-
-
-
+    The report now includes a per-database table: Origin / Retractions / Duplicates / Remaining.
+
+    Returns (total_rows_written, out_path, report_path or None if disabled).
+    """
+    fieldnames = fieldnames or DEFAULT_FIELDNAMES
+    out_format = out_format.lower()
+    if out_format not in {"csv", "json", "xlsx"}:
+        raise ValueError(f"Unknown out_format: {out_format}")
+
+    # Per-run accumulators
+    per_file_lines: List[str] = []
+
+    year_counter = Counter()
+    type_counter = Counter()
+    journal_counter = Counter()
+    author_counter = Counter()
+
+    # Dedupe state
+    seen_keys: set[str] = set()
+    duplicates_counter = Counter()  # global key -> duplicate count
+    dupes_removed_per_db: Dict[str, int] = {}
+
+    # Per-database accounting for the table
+    # Origin: records after filters, before dedupe (and before max_records per file cap)
+    # Retractions: simple heuristic via is_retraction()
+    # Duplicates: removed due to dedupe
+    # Remaining: exported (post-dedupe)
+    per_db: Dict[str, Dict[str, int]] = {}
+
+    def rows() -> Iterable[Dict[str, str]]:
+        nonlocal per_file_lines, seen_keys, duplicates_counter, per_db
+        nonlocal year_counter, type_counter, journal_counter, author_counter, dupes_removed_per_db

         for xml_path in inputs:
             database = xml_path.stem
+            per_db.setdefault(database, {"origin": 0, "retractions": 0, "duplicates": 0, "remaining": 0})
+            dupes_removed_per_db.setdefault(database, 0)
+
             logging.info("Processing %s (database=%s)", xml_path.name, database)
-
+
+            produced = 0
+            skipped = 0
+
+            buffered: List[Dict[str, str]] = []
+            buffered_keys_index: Dict[str, int] = {}

             for rec in iter_records(xml_path):
                 try:
-                    row
-
-
-
-
-
-
+                    # Build row (for filters & output)
+                    row = process_record(rec, database=database)
+
+                    # Filter
+                    if not record_matches_filters(row, ref_type, year):
+                        continue
+
+                    # Origin++ (count any passing-filter record before dedupe)
+                    per_db[database]["origin"] += 1
+
+                    # Retraction heuristic
+                    if is_retraction(rec):
+                        per_db[database]["retractions"] += 1
+
+                    # Dedup
+                    k = dedupe_key(row, dedupe)
+                    if k and dedupe != "none":
+                        if dedupe_keep == "first":
+                            if k in seen_keys:
+                                duplicates_counter[k] += 1
+                                per_db[database]["duplicates"] += 1
+                                dupes_removed_per_db[database] += 1
+                                continue
+                            seen_keys.add(k)
+                            buffered.append(row)
+                            produced += 1
+                        else:  # keep last within this file
+                            if k in buffered_keys_index:
+                                # replace old occurrence in this file buffer
+                                prev_idx = buffered_keys_index[k]
+                                buffered[prev_idx] = row
+                                duplicates_counter[k] += 1
+                                per_db[database]["duplicates"] += 1
+                                dupes_removed_per_db[database] += 1
+                            else:
+                                buffered_keys_index[k] = len(buffered)
+                                buffered.append(row)
+                                produced += 1
+                            seen_keys.add(k)
+                    else:
+                        buffered.append(row)
+                        produced += 1
+
+                    if max_records_per_file and produced >= max_records_per_file:
+                        break
+
                 except Exception:
-
+                    skipped += 1
                     logging.debug("Record error in %s", xml_path, exc_info=True)

-
+            # Remaining = exported from this file
+            per_db[database]["remaining"] += len(buffered)
+
+            per_file_lines.append(f"{xml_path.name:<15} : {len(buffered)} exported, {skipped} skipped")
+
+            # Stats
+            if stats:
+                for r in buffered:
+                    y = (r.get("year") or "").strip()
+                    t = (r.get("ref_type") or "").strip()
+                    j = (r.get("journal") or "").strip()
+                    if y:
+                        year_counter[y] += 1
+                    if t:
+                        type_counter[t] += 1
+                    if j:
+                        journal_counter[j] += 1
+                    if r.get("authors"):
+                        for a in (x.strip() for x in r["authors"].split(";")):
+                            if a:
+                                author_counter[a] += 1
+
+            # Yield to writer
+            for r in buffered:
+                yield r
+
+    # Select writer
+    start_ts = time.time()
+    run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

-
-
-
-
-
-
-
+    if out_format == "csv":
+        total = _write_rows_csv(rows(), out_path, fieldnames, delimiter, quoting, include_header, encoding)
+    elif out_format == "json":
+        total = _write_rows_json(rows(), out_path, fieldnames, encoding)
+    else:  # xlsx
+        total = _write_rows_xlsx(rows(), out_path, fieldnames)
+
+    duration = time.time() - start_ts
+
+    # ---------- Pretty report builder ----------
+    def _header_line(title: str) -> List[str]:
+        bar = "=" * 40
+        return [bar, title, bar]
+
+    def _section_line(title: str) -> List[str]:
+        return ["", title, "-" * 40]
+
+    report_lines: List[str] = []
+    report_lines += _header_line("EndNote Export Report")
+    report_lines += [
+        f"Run started : {run_start}",
+        f"Files : {len(inputs)}",
+        f"Duration : {duration:.2f} seconds",
     ]
-    with open(report_path, "w", encoding="utf-8") as rf:
-        rf.write("\n".join(report_lines))

-
+    # Per-file section
+    report_lines += _section_line("Per-file results")
+    report_lines += per_file_lines
+    report_lines.append(f"TOTAL exported: {total}")
+
+    # Per-database duplicates table
+    # Build totals row
+    if per_db:
+        report_lines += _section_line("Duplicates table (by database)")
+        # compute column widths
+        db_names = list(per_db.keys())
+        db_col_w = max([len("Database")] + [len(db) for db in db_names])
+
+        # totals
+        tot_origin = sum(d["origin"] for d in per_db.values())
+        tot_retract = sum(d["retractions"] for d in per_db.values())
+        tot_dupes = sum(d["duplicates"] for d in per_db.values())
+        tot_remain = sum(d["remaining"] for d in per_db.values())
+
+        header = f"{'Database':<{db_col_w}} {'Origin':>8} {'Retractions':>12} {'Duplicates':>10} {'Remaining':>10}"
+        report_lines.append(header)
+        report_lines.append("-" * len(header))
+
+        for db in sorted(per_db.keys()):
+            d = per_db[db]
+            line = (
+                f"{db:<{db_col_w}} "
+                f"{d['origin']:>8} "
+                f"{d['retractions']:>12} "
+                f"{d['duplicates']:>10} "
+                f"{d['remaining']:>10}"
+            )
+            report_lines.append(line)
+
+        total_line = (
+            f"{'TOTAL':<{db_col_w}} "
+            f"{tot_origin:>8} "
+            f"{tot_retract:>12} "
+            f"{tot_dupes:>10} "
+            f"{tot_remain:>10}"
+        )
+        report_lines.append(total_line)
+
+    # Duplicates key summary (top)
+    if dedupe != "none":
+        report_lines += _section_line("Duplicate keys (top)")
+        total_dupes_global = sum(duplicates_counter.values())
+        report_lines.append(f"Mode : {dedupe}")
+        report_lines.append(f"Keep : {dedupe_keep}")
+        report_lines.append(f"Removed: {total_dupes_global}")
+        if total_dupes_global > 0:
+            report_lines.append("Details (top):")
+            for k, c in duplicates_counter.most_common(DUPES_DETAILS_LIMIT):
+                report_lines.append(f"  {k} : {c} duplicate(s)")
+
+    # Summary stats
+    if stats:
+        def head(counter: Counter, n: int = 10):
+            return [(k, c) for k, c in counter.most_common(n) if k]
+
+        report_lines += _section_line("Summary stats")
+        # Year
+        report_lines.append("By year:")
+        for y in sorted(year_counter.keys()):
+            report_lines.append(f"  {y:>6} : {year_counter[y]}")
+        # Ref type
+        report_lines.append("")
+        report_lines.append("By ref_type (top):")
+        for k, c in head(type_counter, STATS_LIST_LIMIT):
+            report_lines.append(f"  {k}: {c}")
+        # Journal
+        report_lines.append("")
+        report_lines.append(f"By journal (top {STATS_LIST_LIMIT}):")
+        for k, c in head(journal_counter, STATS_LIST_LIMIT):
+            report_lines.append(f"  {k}: {c}")
+        # Authors
+        report_lines.append("")
+        report_lines.append(f"Top authors (top {top_authors}):")
+        for k, c in head(author_counter, top_authors):
+            report_lines.append(f"  {k}: {c}")
+
+    # Optional JSON dump
+    if stats_json:
+        ensure_parent_dir(stats_json)
+        with open(stats_json, "w", encoding="utf-8") as jf:
+            json.dump(
+                {
+                    "totals": {
+                        "exported": total,
+                        "files_processed": len(inputs),
+                        "duration_seconds": duration,
+                    },
+                    "by_year": dict(year_counter),
+                    "by_ref_type": dict(type_counter),
+                    "by_journal": dict(journal_counter),
+                    "top_authors": author_counter.most_common(top_authors),
+                    "duplicates": {
+                        "mode": dedupe,
+                        "keep": dedupe_keep,
+                        "removed": sum(duplicates_counter.values()) if dedupe != "none" else 0,
+                        "top": duplicates_counter.most_common(DUPES_DETAILS_LIMIT) if dedupe != "none" else [],
+                        "by_database": per_db,
+                    },
+                },
+                jf,
+                ensure_ascii=False,
+                indent=2,
+            )
+
+    # Write report unless disabled
+    final_report_path: Optional[Path] = report_path
+    if final_report_path is not None:
+        final_report_path = final_report_path or out_path.with_name(out_path.stem + "_report.txt")
+        ensure_parent_dir(final_report_path)
+        with open(final_report_path, "w", encoding="utf-8") as rf:
+            rf.write("\n".join(report_lines))
+
+    return total, out_path, final_report_path

-
-
+
+# ----------------------------
+# Back-compat convenience wrappers (CSV only)
+# ----------------------------
+
+def export_files_to_csv_with_report(
+    inputs: List[Path],
+    csv_path: Path,
+    report_path: Optional[Path] = None,
+    *,
+    fieldnames: List[str] = None,
+    delimiter: str = ",",
+    quoting: str = "minimal",
+    include_header: bool = True,
+    encoding: str = "utf-8",
+    ref_type: Optional[str] = None,
+    year: Optional[str] = None,
+    max_records_per_file: Optional[int] = None,
+) -> Tuple[int, Path, Optional[Path]]:
+    """Legacy API: export to CSV + TXT report (or no report if report_path=None)."""
+    return export_files_with_report(
+        inputs=inputs,
+        out_path=csv_path,
+        out_format="csv",
+        fieldnames=fieldnames,
+        delimiter=delimiter,
+        quoting=quoting,
+        include_header=include_header,
+        encoding=encoding,
+        ref_type=ref_type,
+        year=year,
+        max_records_per_file=max_records_per_file,
+        report_path=report_path,
+    )
+
+
+def export(xml_file: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
+    """Convenience: single XML file to CSV (+report unless disabled)."""
     return export_files_to_csv_with_report([xml_file], csv_path, **kwargs)

-
-
+
+def export_folder(folder: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
+    """Convenience: all *.xml in folder to CSV (+report unless disabled)."""
     inputs = sorted(p for p in Path(folder).glob("*.xml") if p.is_file())
     if not inputs:
         raise FileNotFoundError(f"No *.xml found in {folder}")
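Taken together, `export_files_with_report()` replaces the CSV-only pipeline, while the old names remain as thin wrappers around it. A sketch of calling the new entry point directly for JSON output with DOI-based dedupe; the file names are hypothetical:

```python
from pathlib import Path

from endnote_utils.core import export_files_with_report

total, out_path, report_path = export_files_with_report(
    inputs=[Path("data/IEEE.xml"), Path("data/Scopus.xml")],
    out_path=Path("output/records.json"),
    out_format="json",
    dedupe="doi",          # keep the first occurrence of each DOI across both files
    dedupe_keep="first",
    stats=True,
    stats_json=Path("output/stats.json"),
    report_path=Path("output/records_report.txt"),
)
print(total, out_path, report_path)
```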
endnote_utils-0.2.0.dist-info/METADATA
ADDED
@@ -0,0 +1,223 @@
+Metadata-Version: 2.4
+Name: endnote-utils
+Version: 0.2.0
+Summary: Convert EndNote XML to CSV/JSON/XLSX with streaming parse and TXT report.
+Author-email: Minh Quach <minhquach8@gmail.com>
+License: MIT
+Keywords: endnote,xml,csv,bibliography,research
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: openpyxl>=3.1.0
+
+# EndNote Utils
+
+Convert **EndNote XML files** into clean CSV/JSON/XLSX with automatic TXT reports.
+Supports both **Python API** and **command-line interface (CLI)**.
+
+---
+
+## Features
+
+- ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
+- ✅ Streams `<record>` elements using `iterparse` (low memory usage)
+- ✅ Extracts fields:
+  `database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, keywords, publisher, isbn, language, extracted_date`
+- ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
+- ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
+- ✅ Supports **multiple output formats**: CSV, JSON, XLSX
+- ✅ Always generates a **TXT report** (default: `<out>_report.txt`) with:
+  - per-file counts (exported/skipped)
+  - totals, files processed
+  - run timestamp & duration
+  - **duplicate table** per database (Origin / Retractions / Duplicates / Remaining)
+  - optional duplicate key list (top-N)
+  - optional summary stats (year, ref_type, journal, top authors)
+- ✅ Auto-creates output folders if missing
+- ✅ Deduplication:
+  - `--dedupe doi` (unique by DOI)
+  - `--dedupe title-year` (unique by normalized title + year)
+  - `--dedupe-keep first|last` (keep first or last occurrence within each file)
+- ✅ Summary stats (`--stats`) with optional JSON export (`--stats-json`)
+- ✅ CLI options for CSV formatting, filters, verbosity
+- ✅ Importable Python API for scripting & integration
+
+---
+
+## Installation
+
+### From PyPI
+
+```bash
+pip install endnote-utils
+```
+
+Requires **Python 3.8+**.
+
+---
+
+## Usage
+
+### Command Line
+
+#### Single file
+
+```bash
+endnote-utils --xml data/IEEE.xml --out output/ieee.csv
+```
+
+#### Folder with multiple files
+
+```bash
+endnote-utils --folder data/xmls --out output/all_records.csv
+```
+
+#### Custom report path
+
+```bash
+endnote-utils \
+  --xml data/Scopus.xml \
+  --out output/scopus.csv \
+  --report reports/scopus_run.txt \
+  --stats \
+  --verbose
+```
+
+If `--report` is not provided, it defaults to `<out>_report.txt`.
+Use `--no-report` to disable report generation.
+
+---
+
+### CLI Options
+
+| Option | Description | Default |
+| --------------- | --------------------------------------------------- | ------------------ |
+| `--xml` | Path to a single EndNote XML file | – |
+| `--folder` | Path to a folder containing multiple `*.xml` files | – |
+| `--csv` | (Legacy) Output CSV path | – |
+| `--out` | Generic output path (`.csv`, `.json`, `.xlsx`) | – |
+| `--format` | Explicit format (`csv`, `json`, `xlsx`) | inferred |
+| `--report` | Output TXT report path | `<out>_report.txt` |
+| `--no-report` | Disable TXT report completely | – |
+| `--delimiter` | CSV delimiter | `,` |
+| `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
+| `--no-header` | Suppress CSV header row | – |
+| `--encoding` | Output text encoding | `utf-8` |
+| `--ref-type` | Only include records with this `ref_type` name | – |
+| `--year` | Only include records with this year | – |
+| `--max-records` | Stop after N records per file (for testing) | – |
+| `--dedupe` | Deduplicate mode: `none`, `doi`, `title-year` | `none` |
+| `--dedupe-keep` | Deduplication strategy: `first`, `last` | `first` |
+| `--stats` | Include summary stats in TXT report | – |
+| `--stats-json` | Path to JSON file to save stats & duplicate info | – |
+| `--verbose` | Verbose logging with debug details | – |
+
+---
+
+### Example Report (snippet)
+
+```
+========================================
+EndNote Export Report
+========================================
+Run started : 2025-09-11 14:30:22
+Files : 4
+Duration : 0.47 seconds
+
+Per-file results
+----------------------------------------
+GGScholar.xml   : 13 exported, 0 skipped
+IEEE.xml        : 2147 exported, 0 skipped
+PubMed.xml      : 504 exported, 0 skipped
+Scopus.xml      : 847 exported, 0 skipped
+TOTAL exported: 3511
+
+Duplicates table (by database)
+----------------------------------------
+Database    Origin  Retractions  Duplicates  Remaining
+------------------------------------------------------
+GGScholar      179            0          27        152
+IEEE          1900            0         589       1311
+PubMed         320            0         225         95
+Scopus        1999            1         511       1489
+TOTAL         4410            1        1352       3047
+
+Duplicate keys (top)
+----------------------------------------
+Mode : doi
+Keep : first
+Removed: 1352
+Details (top):
+  10.1109/SPMB55497.2022.10014965 : 3 duplicate(s)
+  10.1109/TSSA63730.2024.10864368 : 2 duplicate(s)
+
+Summary stats
+----------------------------------------
+By year:
+  2022 : 569
+  2023 : 684
+  2024 : 1148
+  2025 : 1108
+
+By ref_type (top):
+  Journal Article: 2037
+  Conference Proceedings: 1470
+  Book Section: 4
+
+By journal (top 20):
+  IEEE Access: 175
+  IEEE Journal of Biomedical and Health Informatics: 67
+  ...
+
+Top authors (top 10):
+  Y. Wang: 50
+  X. Wang: 35
+  ...
+```
+
+---
+
+## Python API
+
+```python
+from pathlib import Path
+from endnote_utils import export, export_folder
+
+# Single file
+total, out_file, report_file = export(
+    Path("data/IEEE.xml"),
+    Path("output/ieee.csv"),
+    dedupe="doi", stats=True
+)
+
+# Folder
+total, out_file, report_file = export_folder(
+    Path("data/xmls"),
+    Path("output/all.csv"),
+    ref_type="Conference Proceedings",
+    year="2024",
+    dedupe="title-year",
+    dedupe_keep="last",
+    stats=True,
+    stats_json=Path("output/stats.json"),
+)
+```
+
+---
+
+## Development Notes
+
+* Pure Python, uses only standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`, `json`).
+* Optional dependency: `openpyxl` (for Excel `.xlsx` export).
+* Streaming XML parsing avoids high memory usage.
+* Deduplication strategies configurable (`doi` / `title-year`).
+* Report includes per-database table and optional JSON snapshot.
+* Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
+
+---
+
+## License
+
+MIT License © 2025 Minh Quach
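The README above mentions `--stats-json` but does not show its shape. The keys below follow the `json.dump()` call added in `core.py`; the path is only an example:

```python
import json

with open("output/stats.json", encoding="utf-8") as f:
    stats = json.load(f)

print(stats["totals"]["exported"])       # total records written
print(sorted(stats["by_year"].items()))  # e.g. [('2022', 569), ('2023', 684), ...]
print(stats["duplicates"]["removed"])    # rows dropped by --dedupe
for author, count in stats["top_authors"]:
    print(f"{author}: {count}")
```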
endnote_utils-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+endnote_utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
+endnote_utils/cli.py,sha256=QFE73sKPMEbRiOuCVpMMQXT3RBx854uU-GS-ZHQv1Kw,7025
+endnote_utils/core.py,sha256=e52ebYHx2QdY3juS3Jt8-SQhJyDLvIycaj0WhIatang,22960
+endnote_utils-0.2.0.dist-info/METADATA,sha256=wllJhkRJlwO1eUROFNqvunl-rdSNiaSKzrTVH4p8zVs,7252
+endnote_utils-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+endnote_utils-0.2.0.dist-info/entry_points.txt,sha256=l8OEYTGiRj49CND6Xmpk4cIlAE8WJg6UInRo-YRvg8w,57
+endnote_utils-0.2.0.dist-info/top_level.txt,sha256=6ZlEkqvnKvYAHI7P3wlh5j3vDQF4-bKLIdYCwPTL-G8,14
+endnote_utils-0.2.0.dist-info/RECORD,,
endnote_utils-0.1.4.dist-info/METADATA
DELETED
@@ -1,145 +0,0 @@
-Metadata-Version: 2.4
-Name: endnote-utils
-Version: 0.1.4
-Summary: Convert EndNote XML to CSV with streaming parse and TXT report.
-Author-email: Minh Quach <minhquach8@gmail.com>
-License: MIT
-Keywords: endnote,xml,csv,bibliography,research
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-
-# EndNote Utils
-
-Convert **EndNote XML files** into clean CSVs with automatic TXT reports.
-Supports both **Python API** and **command-line interface (CLI)**.
-
----
-
-## Features
-
-- ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
-- ✅ Streams `<record>` elements using `iterparse` (low memory usage)
-- ✅ Extracts fields:
-  `database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, extracted_date`
-- ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
-- ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
-- ✅ Always generates a **TXT report** (default: `<csv>_report.txt`) with:
-  - per-file counts (exported/skipped records)
-  - totals, files processed
-  - run timestamp & duration
-- ✅ Auto-creates output folders if missing
-- ✅ CLI options for CSV formatting, filters, verbosity
-- ✅ Importable Python API for scripting & integration
-
----
-
-## Installation
-
-### From PyPI
-
-```bash
-pip install endnote-utils
-```
-
-Requires **Python 3.8+**.
-
----
-
-## Usage
-
-### Command Line
-
-#### Single file
-
-```bash
-endnote-utils --xml data/IEEE.xml --csv output/ieee.csv
-```
-
-#### Folder with multiple files
-
-```bash
-endnote-utils --folder data/xmls --csv output/all_records.csv
-```
-
-#### Custom report path
-
-```bash
-endnote-utils \
-  --xml data/Scopus.xml \
-  --csv output/scopus.csv \
-  --report reports/scopus_run.txt
-```
-
-If `--report` is not provided, it defaults to `<csv>_report.txt`.
-
----
-
-### CLI Options
-
-| Option | Description | Default |
-| --------------- | --------------------------------------------------- | ------------------ |
-| `--xml` | Path to a single EndNote XML file | – |
-| `--folder` | Path to a folder containing multiple `*.xml` files | – |
-| `--csv` | Output CSV path | – |
-| `--report` | Output TXT report path | `<csv>_report.txt` |
-| `--delimiter` | CSV delimiter | `,` |
-| `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
-| `--no-header` | Suppress CSV header row | – |
-| `--encoding` | Output CSV encoding | `utf-8` |
-| `--ref-type` | Only include records with this `ref_type` name | – |
-| `--year` | Only include records with this year | – |
-| `--max-records` | Stop after N records per file (useful for testing) | – |
-| `--verbose` | Verbose logging with debug details | – |
-
----
-
-### Example Report
-
-```
-Run started: 2025-09-11 14:30:22
-IEEE.xml: 120 exported, 0 skipped
-Scopus.xml: 95 exported, 2 skipped
-TOTAL exported: 215
-Files processed: 2
-Duration: 3.14 seconds
-```
-
----
-
-## Python API
-
-You can also use it directly in Python scripts:
-
-```python
-from pathlib import Path
-from endnote_utils import export, export_folder
-
-# Single file
-total, csv_out, report_out = export(
-    Path("data/IEEE.xml"), Path("output/ieee.csv")
-)
-
-# Folder
-total, csv_out, report_out = export_folder(
-    Path("data/xmls"), Path("output/all.csv"),
-    ref_type="Conference Proceedings", year="2024"
-)
-```
-
----
-
-## Development Notes
-
-* Pure Python, uses only standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`).
-* Streaming XML parsing avoids high memory usage.
-* Robust error handling: skips malformed records but logs them in verbose mode.
-* Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
-
----
-
-## License
-
-MIT License © 2025 Minh Quach
endnote_utils-0.1.4.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-endnote_utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
-endnote_utils/cli.py,sha256=TQxdO7IlaRXwNTm0MpBVk9CeUTUGgtlcI0O3O9xhgdM,2160
-endnote_utils/core.py,sha256=cddpuRMF5RC5mp3Lll0eTA9MXLzcVDnDl1Z7IMHOr0k,7480
-endnote_utils-0.1.4.dist-info/METADATA,sha256=FXD6AXEFT1_lqYpuDYtX89XnNsQpZrftudMp-YzodQI,4316
-endnote_utils-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-endnote_utils-0.1.4.dist-info/entry_points.txt,sha256=l8OEYTGiRj49CND6Xmpk4cIlAE8WJg6UInRo-YRvg8w,57
-endnote_utils-0.1.4.dist-info/top_level.txt,sha256=6ZlEkqvnKvYAHI7P3wlh5j3vDQF4-bKLIdYCwPTL-G8,14
-endnote_utils-0.1.4.dist-info/RECORD,,
{endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/WHEEL
File without changes
{endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/entry_points.txt
File without changes
{endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/top_level.txt
File without changes