endnote-utils 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
endnote_utils/cli.py CHANGED
@@ -4,51 +4,183 @@ import argparse
4
4
  import logging
5
5
  import sys
6
6
  from pathlib import Path
7
+ from typing import List, Optional, Tuple
7
8
 
8
- from .core import DEFAULT_FIELDNAMES, export, export_folder
9
+ from .core import (
10
+ DEFAULT_FIELDNAMES,
11
+ export_files_with_report, # generic writer: csv/json/xlsx
12
+ )
13
+
14
+ SUPPORTED_FORMATS = ("csv", "json", "xlsx")
15
+ EXT_TO_FORMAT = {".csv": "csv", ".json": "json", ".xlsx": "xlsx"}
9
16
 
10
17
 
11
18
  def build_parser() -> argparse.ArgumentParser:
12
- p = argparse.ArgumentParser(description="Export EndNote XML (file or folder) to CSV + TXT report.")
19
+ p = argparse.ArgumentParser(
20
+ description="Export EndNote XML (file or folder) to CSV/JSON/XLSX with a TXT report."
21
+ )
22
+
23
+ # Input source (mutually exclusive)
13
24
  g = p.add_mutually_exclusive_group(required=True)
14
25
  g.add_argument("--xml", help="Path to a single EndNote XML file.")
15
26
  g.add_argument("--folder", help="Path to a folder containing *.xml files.")
16
- p.add_argument("--csv", required=True, help="Path to CSV output file.")
17
- p.add_argument("--report", required=False, help="Path to TXT report (default: <csv>_report.txt).")
18
- p.add_argument("--delimiter", default=",")
19
- p.add_argument("--quoting", default="minimal", choices=["minimal","all","nonnumeric","none"])
20
- p.add_argument("--no-header", action="store_true")
21
- p.add_argument("--encoding", default="utf-8")
22
- p.add_argument("--ref-type", default=None)
23
- p.add_argument("--year", default=None)
24
- p.add_argument("--max-records", type=int, default=None)
25
- p.add_argument("--verbose", action="store_true")
27
+
28
+ # Output selection (CSV legacy flag + new generic flags)
29
+ p.add_argument(
30
+ "--csv",
31
+ required=False,
32
+ help="(Legacy) Output CSV path. Prefer --out for csv/json/xlsx.",
33
+ )
34
+ p.add_argument(
35
+ "--out",
36
+ required=False,
37
+ help="Generic output path; format inferred from file extension if --format not provided. "
38
+ "Supported extensions: .csv, .json, .xlsx",
39
+ )
40
+ p.add_argument(
41
+ "--format",
42
+ choices=SUPPORTED_FORMATS,
43
+ help="Output format. If omitted, inferred from --out extension or --csv.",
44
+ )
45
+
46
+ # Report controls
47
+ p.add_argument("--report", required=False, help="Path to TXT report (default: <output>_report.txt).")
48
+ p.add_argument(
49
+ "--no-report",
50
+ action="store_true",
51
+ help="Disable writing the TXT report (by default, a report is always generated).",
52
+ )
53
+
54
+ # CSV formatting options (delimiter/quoting/header are ignored for JSON/XLSX; encoding also applies to JSON)
55
+ p.add_argument("--delimiter", default=",", help="CSV delimiter (default: ',').")
56
+ p.add_argument(
57
+ "--quoting",
58
+ default="minimal",
59
+ choices=["minimal", "all", "nonnumeric", "none"],
60
+ help="CSV quoting mode (default: minimal).",
61
+ )
62
+ p.add_argument("--no-header", action="store_true", help="Do not write CSV header row.")
63
+ p.add_argument("--encoding", default="utf-8", help="Output text encoding (default: utf-8).")
64
+
65
+ # Filters / limits
66
+ p.add_argument("--ref-type", default=None, help="Filter by ref_type name.")
67
+ p.add_argument("--year", default=None, help="Filter by year.")
68
+ p.add_argument("--max-records", type=int, default=None, help="Max records per file (testing).")
69
+
70
+ # Deduplication & Stats
71
+ p.add_argument("--dedupe", choices=["none", "doi", "title-year"], default="none",
72
+ help="Deduplicate records by key. Default: none.")
73
+ p.add_argument("--dedupe-keep", choices=["first", "last"], default="first",
74
+ help="When duplicates found, keep the first or last occurrence. Default: first.")
75
+ p.add_argument("--stats", action="store_true",
76
+ help="Compute summary stats and include them in the TXT report.")
77
+ p.add_argument("--stats-json",
78
+ help="Optional JSON file path to write detailed stats (when --stats is used).")
79
+ p.add_argument("--top-authors", type=int, default=10,
80
+ help="How many top authors to list in the report/stats JSON. Default: 10.")
81
+
82
+ # Verbosity
83
+ p.add_argument("--verbose", action="store_true", help="Verbose logging.")
84
+
26
85
  return p
27
86
 
87
+
88
+ def _resolve_inputs(args: argparse.Namespace) -> List[Path]:
89
+ if args.xml:
90
+ xml_path = Path(args.xml)
91
+ if not xml_path.is_file():
92
+ raise FileNotFoundError(xml_path)
93
+ return [xml_path]
94
+
95
+ folder = Path(args.folder)
96
+ if not folder.is_dir():
97
+ raise FileNotFoundError(folder)
98
+ inputs = sorted(p for p in folder.glob("*.xml") if p.is_file())
99
+ if not inputs:
100
+ raise FileNotFoundError(f"No *.xml files found in folder: {folder}")
101
+ return inputs
102
+
103
+
104
+ def _resolve_output_and_format(args: argparse.Namespace) -> tuple[Path, str, Optional[Path]]:
105
+ """
106
+ Decide final out_path, out_format, and report_path using:
107
+ - Prefer --out/--format if provided
108
+ - Fallback to --csv (legacy) which implies CSV
109
+ - If --no-report, return report_path=None
110
+ """
111
+ target_path: Optional[Path] = None
112
+ out_format: Optional[str] = None
113
+
114
+ if args.out:
115
+ target_path = Path(args.out)
116
+ out_format = args.format
117
+ if not out_format:
118
+ # infer from extension
119
+ out_format = EXT_TO_FORMAT.get(target_path.suffix.lower())
120
+ if not out_format:
121
+ raise SystemExit(
122
+ "Cannot infer output format from extension. "
123
+ "Use --format {csv,json,xlsx} or set a supported extension."
124
+ )
125
+ elif args.csv:
126
+ target_path = Path(args.csv)
127
+ out_format = args.format or "csv"
128
+ if out_format != "csv":
129
+ # user asked for non-csv but used --csv path
130
+ raise SystemExit("When using --csv, --format must be 'csv'. Use --out for json/xlsx.")
131
+ else:
132
+ raise SystemExit("You must provide either --out (preferred) or --csv (legacy).")
133
+
134
+ # Report path defaults next to chosen output file (unless disabled)
135
+ if args.no_report:
136
+ report_path: Optional[Path] = None
137
+ else:
138
+ report_path = Path(args.report) if args.report else target_path.with_name(target_path.stem + "_report.txt")
139
+
140
+ return target_path, out_format, report_path
141
+
142
+
28
143
  def main() -> None:
29
144
  args = build_parser().parse_args()
30
145
  logging.basicConfig(
31
146
  level=logging.DEBUG if args.verbose else logging.INFO,
32
- format="%(levelname)s: %(message)s", stream=sys.stderr
33
- )
34
- csv_path = Path(args.csv)
35
- report_path = Path(args.report) if args.report else csv_path.with_name(csv_path.stem + "_report.txt")
36
- kwargs = dict(
37
- report_path=report_path,
38
- fieldnames=DEFAULT_FIELDNAMES,
39
- delimiter=args.delimiter,
40
- quoting=args.quoting,
41
- include_header=not args.no_header,
42
- encoding=args.encoding,
43
- ref_type=args.ref_type,
44
- year=args.year,
45
- max_records_per_file=args.max_records,
147
+ format="%(levelname)s: %(message)s",
148
+ stream=sys.stderr,
46
149
  )
47
150
 
48
- if args.xml:
49
- total, csv_out, rep_out = export(Path(args.xml), csv_path, **kwargs)
50
- else:
51
- total, csv_out, rep_out = export_folder(Path(args.folder), csv_path, **kwargs)
151
+ try:
152
+ inputs = _resolve_inputs(args)
153
+ out_path, out_format, report_path = _resolve_output_and_format(args)
154
+
155
+ total, final_out, final_report = export_files_with_report(
156
+ inputs=inputs,
157
+ out_path=out_path,
158
+ out_format=out_format,
159
+ fieldnames=DEFAULT_FIELDNAMES,
160
+ delimiter=args.delimiter,
161
+ quoting=args.quoting,
162
+ include_header=not args.no_header,
163
+ encoding=args.encoding,
164
+ ref_type=args.ref_type,
165
+ year=args.year,
166
+ max_records_per_file=args.max_records,
167
+ dedupe=args.dedupe,
168
+ dedupe_keep=args.dedupe_keep,
169
+ stats=args.stats,
170
+ stats_json=Path(args.stats_json) if args.stats_json else None,
171
+ top_authors=args.top_authors,
172
+ report_path=report_path, # may be None → core should skip writing report
173
+ )
174
+
175
+ logging.info("Exported %d record(s) → %s", total, final_out)
176
+ if report_path is None:
177
+ logging.info("Report disabled by --no-report.")
178
+ else:
179
+ logging.info("Report → %s", final_report)
52
180
 
53
- logging.info("Exported %d record(s) → %s", total, csv_out)
54
- logging.info("Report %s", rep_out)
181
+ except FileNotFoundError as e:
182
+ logging.error("File/folder not found: %s", e)
183
+ sys.exit(1)
184
+ except Exception as e:
185
+ logging.error("Unexpected error: %s", e)
186
+ sys.exit(2)
endnote_utils/core.py CHANGED
@@ -1,17 +1,37 @@
1
- # src/endnote_exporter/core.py
1
+ # src/endnote_utils/core.py
2
2
  from __future__ import annotations
3
3
 
4
4
  import csv
5
+ import json
5
6
  import logging
6
7
  import time
7
8
  import xml.etree.ElementTree as ET
9
+ from collections import Counter
8
10
  from datetime import datetime
9
11
  from pathlib import Path
10
12
  from typing import Dict, Iterable, List, Optional, Tuple
11
13
 
14
+ # ----------------------------
15
+ # Public constants
16
+ # ----------------------------
17
+
12
18
  DEFAULT_FIELDNAMES: List[str] = [
13
- "database", "ref_type", "title", "journal", "authors", "year",
14
- "volume", "number", "abstract", "doi", "urls", "extracted_date",
19
+ "database",
20
+ "ref_type",
21
+ "title",
22
+ "journal",
23
+ "authors",
24
+ "year",
25
+ "volume",
26
+ "number",
27
+ "abstract",
28
+ "doi",
29
+ "urls",
30
+ "keywords",
31
+ "publisher",
32
+ "isbn",
33
+ "language",
34
+ "extracted_date",
15
35
  ]
16
36
 
17
37
  CSV_QUOTING_MAP = {
@@ -21,17 +41,27 @@ CSV_QUOTING_MAP = {
21
41
  "none": csv.QUOTE_NONE,
22
42
  }
23
43
 
44
+ # Report layout
45
+ DUPES_DETAILS_LIMIT = 50
46
+ STATS_LIST_LIMIT = 20
47
+
48
+
49
+ # ----------------------------
50
+ # FS helpers
51
+ # ----------------------------
52
+
24
53
  def ensure_parent_dir(p: Path) -> None:
54
+ """Create parent directory if it doesn't exist."""
25
55
  p.parent.mkdir(parents=True, exist_ok=True)
26
56
 
57
+
27
58
  # ----------------------------
28
- # Utilities
59
+ # Text helpers
29
60
  # ----------------------------
30
61
 
31
62
  def clean_text(text: Optional[str]) -> str:
32
63
  """
33
64
  Trim, collapse internal whitespace, remove stray CRs, keep punctuation intact.
34
- Safer for CSV fields than aggressive normalization.
35
65
  """
36
66
  if not text:
37
67
  return ""
@@ -48,14 +78,18 @@ def safe_find_text(node: ET.Element, path: str) -> str:
48
78
  def join_nonempty(items: Iterable[str], sep: str) -> str:
49
79
  return sep.join(x for x in (i.strip() for i in items) if x)
50
80
 
51
- def ensure_parent_dir(p: Path) -> None:
52
- """Create parent directory if it doesn't exist."""
53
- if not p.parent.exists():
54
- p.parent.mkdir(parents=True, exist_ok=True)
81
+
82
+ def normalize_text_for_key(s: str) -> str:
83
+ """Lowercase + strip non-alnum + single-space. Good for stable keys."""
84
+ if not s:
85
+ return ""
86
+ s = s.lower()
87
+ s = "".join(ch for ch in s if ch.isalnum() or ch.isspace())
88
+ return " ".join(s.split())
55
89
 
56
90
 
57
91
  # ----------------------------
58
- # Record processing
92
+ # Record extraction
59
93
  # ----------------------------
60
94
 
61
95
  def process_doi(record: ET.Element) -> str:
@@ -97,8 +131,24 @@ def extract_urls(record: ET.Element) -> str:
97
131
  return join_nonempty(deduped, " | ")
98
132
 
99
133
 
134
+ def extract_keywords(record: ET.Element) -> str:
135
+ """Collect keywords from //keywords/keyword/style, joined by '; ' (deduped)."""
136
+ items: List[str] = []
137
+ for kw in record.findall(".//keywords/keyword"):
138
+ style = kw.find("style")
139
+ if style is not None and style.text:
140
+ items.append(clean_text(style.text))
141
+ seen = set()
142
+ out: List[str] = []
143
+ for x in items:
144
+ if x not in seen:
145
+ seen.add(x)
146
+ out.append(x)
147
+ return join_nonempty(out, "; ")
148
+
149
+
100
150
  def process_record(record: ET.Element, database: str) -> Dict[str, str]:
101
- """Transform a <record> element into a dictionary for CSV."""
151
+ """Transform a <record> element into a flat dictionary."""
102
152
  ref_type_name = ""
103
153
  ref_type = record.find("ref-type")
104
154
  if ref_type is not None:
@@ -116,10 +166,20 @@ def process_record(record: ET.Element, database: str) -> Dict[str, str]:
116
166
  "abstract": safe_find_text(record, ".//abstract/style"),
117
167
  "doi": process_doi(record),
118
168
  "urls": extract_urls(record),
169
+ "keywords": extract_keywords(record),
170
+ "publisher": safe_find_text(record, ".//publisher/style"),
171
+ "isbn": safe_find_text(record, ".//isbn/style"),
172
+ "language": safe_find_text(record, ".//language/style"),
119
173
  "extracted_date": datetime.now().strftime("%Y-%m-%d"),
120
174
  }
121
175
 
176
+
177
+ # ----------------------------
178
+ # XML streaming + filters
179
+ # ----------------------------
180
+
122
181
  def iter_records(xml_path: Path) -> Iterable[ET.Element]:
182
+ """Stream <record> elements with low memory footprint."""
123
183
  context = ET.iterparse(str(xml_path), events=("start", "end"))
124
184
  _, root = next(context)
125
185
  for event, elem in context:
@@ -128,15 +188,141 @@ def iter_records(xml_path: Path) -> Iterable[ET.Element]:
128
188
  elem.clear()
129
189
  root.clear()
130
190
 
191
+
131
192
  def record_matches_filters(row: Dict[str, str], ref_type: Optional[str], year: Optional[str]) -> bool:
132
- if ref_type and row.get("ref_type") != ref_type: return False
133
- if year and row.get("year") != str(year): return False
193
+ if ref_type and row.get("ref_type") != ref_type:
194
+ return False
195
+ if year and row.get("year") != str(year):
196
+ return False
134
197
  return True
135
198
 
136
- def export_files_to_csv_with_report(
199
+
200
+ # ----------------------------
201
+ # Deduplication helpers
202
+ # ----------------------------
203
+
204
+ def dedupe_key(row: Dict[str, str], mode: str) -> Optional[str]:
205
+ """
206
+ mode: 'none' | 'doi' | 'title-year'
207
+ Returns None when no applicable key can be formed (row passes through).
208
+ """
209
+ if mode == "doi":
210
+ k = (row.get("doi") or "").strip()
211
+ return k or None
212
+ if mode == "title-year":
213
+ title = normalize_text_for_key(row.get("title", ""))
214
+ year = (row.get("year") or "").strip()
215
+ if title and year:
216
+ return f"{title}::{year}"
217
+ return None
218
+ return None
219
+
220
+
221
+ # ----------------------------
222
+ # Retraction detection (basic heuristic)
223
+ # ----------------------------
224
+
225
+ def is_retraction(record: ET.Element) -> bool:
226
+ """
227
+ Heuristic: consider a record 'retraction' if its notes or title contain substrings like:
228
+ 'retraction', 'retracted', 'withdrawn', 'erratum'.
229
+ """
230
+ text_blob = " ".join(
231
+ [
232
+ safe_find_text(record, ".//notes/style"),
233
+ safe_find_text(record, ".//title/style"),
234
+ ]
235
+ ).lower()
236
+ indicators = ("retraction", "retracted", "withdrawn", "erratum")
237
+ return any(tok in text_blob for tok in indicators)
238
+
239
+
240
+ # ----------------------------
241
+ # Writers (CSV / JSON / XLSX)
242
+ # ----------------------------
243
+
244
+ def _write_rows_csv(
245
+ rows_iter: Iterable[Dict[str, str]],
246
+ out_path: Path,
247
+ fieldnames: List[str],
248
+ delimiter: str,
249
+ quoting: str,
250
+ include_header: bool,
251
+ encoding: str,
252
+ ) -> int:
253
+ qmode = CSV_QUOTING_MAP[quoting.lower()]
254
+ ensure_parent_dir(out_path)
255
+ count = 0
256
+ with open(out_path, "w", newline="", encoding=encoding) as f:
257
+ writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
258
+ if include_header:
259
+ writer.writeheader()
260
+ for row in rows_iter:
261
+ writer.writerow({k: row.get(k, "") for k in fieldnames})
262
+ count += 1
263
+ return count
264
+
265
+
266
+ def _write_rows_json(
267
+ rows_iter: Iterable[Dict[str, str]],
268
+ out_path: Path,
269
+ fieldnames: List[str],
270
+ encoding: str,
271
+ ) -> int:
272
+ """Write a JSON array streaming without holding all rows in memory."""
273
+ ensure_parent_dir(out_path)
274
+ count = 0
275
+ with open(out_path, "w", encoding=encoding) as f:
276
+ f.write("[")
277
+ first = True
278
+ for row in rows_iter:
279
+ obj = {k: row.get(k, "") for k in fieldnames}
280
+ if first:
281
+ first = False
282
+ else:
283
+ f.write(",")
284
+ f.write(json.dumps(obj, ensure_ascii=False))
285
+ count += 1
286
+ f.write("]")
287
+ return count
288
+
289
+
290
+ def _write_rows_xlsx(
291
+ rows_iter: Iterable[Dict[str, str]],
292
+ out_path: Path,
293
+ fieldnames: List[str],
294
+ ) -> int:
295
+ """Write an Excel file using openpyxl (installed via project dependencies)."""
296
+ try:
297
+ from openpyxl import Workbook
298
+ except ImportError as e:
299
+ raise RuntimeError(
300
+ "Excel output requires 'openpyxl'. Ensure it is installed."
301
+ ) from e
302
+
303
+ ensure_parent_dir(out_path)
304
+ wb = Workbook()
305
+ ws = wb.active
306
+ ws.title = "records"
307
+ ws.append(fieldnames) # header
308
+
309
+ count = 0
310
+ for row in rows_iter:
311
+ ws.append([row.get(k, "") for k in fieldnames])
312
+ count += 1
313
+
314
+ wb.save(out_path)
315
+ return count
316
+
317
+
318
+ # ----------------------------
319
+ # Generic export + report (+ dedupe + stats, pretty report + duplicates table)
320
+ # ----------------------------
321
+
322
+ def export_files_with_report(
137
323
  inputs: List[Path],
138
- csv_path: Path,
139
- report_path: Optional[Path] = None,
324
+ out_path: Path,
325
+ out_format: str, # "csv" | "json" | "xlsx"
140
326
  *,
141
327
  fieldnames: List[str] = None,
142
328
  delimiter: str = ",",
@@ -146,63 +332,341 @@ def export_files_to_csv_with_report(
146
332
  ref_type: Optional[str] = None,
147
333
  year: Optional[str] = None,
148
334
  max_records_per_file: Optional[int] = None,
149
- ) -> Tuple[int, Path, Path]:
150
- """Primary library API: export one or many XML files to a single CSV + TXT report."""
151
- fieldnames = fieldnames or DEFAULT_FIELDNAMES
152
- qmode = CSV_QUOTING_MAP[quoting]
153
- report_path = report_path or csv_path.with_name(csv_path.stem + "_report.txt")
335
+ report_path: Optional[Path] = None,
336
+ # Dedup + stats
337
+ dedupe: str = "none",
338
+ dedupe_keep: str = "first",
339
+ stats: bool = False,
340
+ stats_json: Optional[Path] = None,
341
+ top_authors: int = 10,
342
+ ) -> Tuple[int, Path, Optional[Path]]:
343
+ """
344
+ Stream records from one or many EndNote XML files and write to CSV/JSON/XLSX.
345
+ Writes a pretty TXT report unless report_path is None.
154
346
 
155
- ensure_parent_dir(csv_path)
156
- ensure_parent_dir(report_path)
347
+ Deduplication:
348
+ - dedupe='doi' → unique by DOI
349
+ - dedupe='title-year' → unique by normalized (title, year)
350
+ - dedupe_keep='first' or 'last' (applies within each input file)
157
351
 
158
- total_written, report_lines = 0, []
159
- start_ts = time.time()
160
- run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
352
+ Stats (when stats=True) add counts by year/ref_type/journal and top authors.
353
+ stats_json (if provided) writes a JSON snapshot of these stats + duplicates.
161
354
 
162
- with open(csv_path, "w", newline="", encoding=encoding) as f:
163
- writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
164
- if include_header:
165
- writer.writeheader()
355
+ The report now includes a per-database table: Origin / Retractions / Duplicates / Remaining.
356
+
357
+ Returns (total_rows_written, out_path, report_path or None if disabled).
358
+ """
359
+ fieldnames = fieldnames or DEFAULT_FIELDNAMES
360
+ out_format = out_format.lower()
361
+ if out_format not in {"csv", "json", "xlsx"}:
362
+ raise ValueError(f"Unknown out_format: {out_format}")
363
+
364
+ # Per-run accumulators
365
+ per_file_lines: List[str] = []
366
+
367
+ year_counter = Counter()
368
+ type_counter = Counter()
369
+ journal_counter = Counter()
370
+ author_counter = Counter()
371
+
372
+ # Dedupe state
373
+ seen_keys: set[str] = set()
374
+ duplicates_counter = Counter() # global key -> duplicate count
375
+ dupes_removed_per_db: Dict[str, int] = {}
376
+
377
+ # Per-database accounting for the table
378
+ # Origin: records after filters, before dedupe (and before max_records per file cap)
379
+ # Retractions: simple heuristic via is_retraction()
380
+ # Duplicates: removed due to dedupe
381
+ # Remaining: exported (post-dedupe)
382
+ per_db: Dict[str, Dict[str, int]] = {}
383
+
384
+ def rows() -> Iterable[Dict[str, str]]:
385
+ nonlocal per_file_lines, seen_keys, duplicates_counter, per_db
386
+ nonlocal year_counter, type_counter, journal_counter, author_counter, dupes_removed_per_db
166
387
 
167
388
  for xml_path in inputs:
168
389
  database = xml_path.stem
390
+ per_db.setdefault(database, {"origin": 0, "retractions": 0, "duplicates": 0, "remaining": 0})
391
+ dupes_removed_per_db.setdefault(database, 0)
392
+
169
393
  logging.info("Processing %s (database=%s)", xml_path.name, database)
170
- file_written = file_skipped = 0
394
+
395
+ produced = 0
396
+ skipped = 0
397
+
398
+ buffered: List[Dict[str, str]] = []
399
+ buffered_keys_index: Dict[str, int] = {}
171
400
 
172
401
  for rec in iter_records(xml_path):
173
402
  try:
174
- row = process_record(rec, database=database) # your existing function
175
- if record_matches_filters(row, ref_type, year):
176
- writer.writerow({k: row.get(k, "") for k in fieldnames})
177
- file_written += 1
178
- total_written += 1
179
- if max_records_per_file and file_written >= max_records_per_file:
180
- break
403
+ # Build row (for filters & output)
404
+ row = process_record(rec, database=database)
405
+
406
+ # Filter
407
+ if not record_matches_filters(row, ref_type, year):
408
+ continue
409
+
410
+ # Origin++ (count any passing-filter record before dedupe)
411
+ per_db[database]["origin"] += 1
412
+
413
+ # Retraction heuristic
414
+ if is_retraction(rec):
415
+ per_db[database]["retractions"] += 1
416
+
417
+ # Dedup
418
+ k = dedupe_key(row, dedupe)
419
+ if k and dedupe != "none":
420
+ if dedupe_keep == "first":
421
+ if k in seen_keys:
422
+ duplicates_counter[k] += 1
423
+ per_db[database]["duplicates"] += 1
424
+ dupes_removed_per_db[database] += 1
425
+ continue
426
+ seen_keys.add(k)
427
+ buffered.append(row)
428
+ produced += 1
429
+ else: # keep last within this file
430
+ if k in buffered_keys_index:
431
+ # replace old occurrence in this file buffer
432
+ prev_idx = buffered_keys_index[k]
433
+ buffered[prev_idx] = row
434
+ duplicates_counter[k] += 1
435
+ per_db[database]["duplicates"] += 1
436
+ dupes_removed_per_db[database] += 1
437
+ else:
438
+ buffered_keys_index[k] = len(buffered)
439
+ buffered.append(row)
440
+ produced += 1
441
+ seen_keys.add(k)
442
+ else:
443
+ buffered.append(row)
444
+ produced += 1
445
+
446
+ if max_records_per_file and produced >= max_records_per_file:
447
+ break
448
+
181
449
  except Exception:
182
- file_skipped += 1
450
+ skipped += 1
183
451
  logging.debug("Record error in %s", xml_path, exc_info=True)
184
452
 
185
- report_lines.append(f"{xml_path.name}: {file_written} exported, {file_skipped} skipped")
453
+ # Remaining = exported from this file
454
+ per_db[database]["remaining"] += len(buffered)
455
+
456
+ per_file_lines.append(f"{xml_path.name:<15} : {len(buffered)} exported, {skipped} skipped")
457
+
458
+ # Stats
459
+ if stats:
460
+ for r in buffered:
461
+ y = (r.get("year") or "").strip()
462
+ t = (r.get("ref_type") or "").strip()
463
+ j = (r.get("journal") or "").strip()
464
+ if y:
465
+ year_counter[y] += 1
466
+ if t:
467
+ type_counter[t] += 1
468
+ if j:
469
+ journal_counter[j] += 1
470
+ if r.get("authors"):
471
+ for a in (x.strip() for x in r["authors"].split(";")):
472
+ if a:
473
+ author_counter[a] += 1
474
+
475
+ # Yield to writer
476
+ for r in buffered:
477
+ yield r
478
+
479
+ # Select writer
480
+ start_ts = time.time()
481
+ run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
186
482
 
187
- dur = time.time() - start_ts
188
- report_lines = [
189
- f"Run started: {run_start}",
190
- *report_lines,
191
- f"TOTAL exported: {total_written}",
192
- f"Files processed: {len(inputs)}",
193
- f"Duration: {dur:.2f} seconds",
483
+ if out_format == "csv":
484
+ total = _write_rows_csv(rows(), out_path, fieldnames, delimiter, quoting, include_header, encoding)
485
+ elif out_format == "json":
486
+ total = _write_rows_json(rows(), out_path, fieldnames, encoding)
487
+ else: # xlsx
488
+ total = _write_rows_xlsx(rows(), out_path, fieldnames)
489
+
490
+ duration = time.time() - start_ts
491
+
492
+ # ---------- Pretty report builder ----------
493
+ def _header_line(title: str) -> List[str]:
494
+ bar = "=" * 40
495
+ return [bar, title, bar]
496
+
497
+ def _section_line(title: str) -> List[str]:
498
+ return ["", title, "-" * 40]
499
+
500
+ report_lines: List[str] = []
501
+ report_lines += _header_line("EndNote Export Report")
502
+ report_lines += [
503
+ f"Run started : {run_start}",
504
+ f"Files : {len(inputs)}",
505
+ f"Duration : {duration:.2f} seconds",
194
506
  ]
195
- with open(report_path, "w", encoding="utf-8") as rf:
196
- rf.write("\n".join(report_lines))
197
507
 
198
- return total_written, csv_path, report_path
508
+ # Per-file section
509
+ report_lines += _section_line("Per-file results")
510
+ report_lines += per_file_lines
511
+ report_lines.append(f"TOTAL exported: {total}")
512
+
513
+ # Per-database duplicates table
514
+ # Build totals row
515
+ if per_db:
516
+ report_lines += _section_line("Duplicates table (by database)")
517
+ # compute column widths
518
+ db_names = list(per_db.keys())
519
+ db_col_w = max([len("Database")] + [len(db) for db in db_names])
520
+
521
+ # totals
522
+ tot_origin = sum(d["origin"] for d in per_db.values())
523
+ tot_retract = sum(d["retractions"] for d in per_db.values())
524
+ tot_dupes = sum(d["duplicates"] for d in per_db.values())
525
+ tot_remain = sum(d["remaining"] for d in per_db.values())
526
+
527
+ header = f"{'Database':<{db_col_w}} {'Origin':>8} {'Retractions':>12} {'Duplicates':>10} {'Remaining':>10}"
528
+ report_lines.append(header)
529
+ report_lines.append("-" * len(header))
530
+
531
+ for db in sorted(per_db.keys()):
532
+ d = per_db[db]
533
+ line = (
534
+ f"{db:<{db_col_w}} "
535
+ f"{d['origin']:>8} "
536
+ f"{d['retractions']:>12} "
537
+ f"{d['duplicates']:>10} "
538
+ f"{d['remaining']:>10}"
539
+ )
540
+ report_lines.append(line)
541
+
542
+ total_line = (
543
+ f"{'TOTAL':<{db_col_w}} "
544
+ f"{tot_origin:>8} "
545
+ f"{tot_retract:>12} "
546
+ f"{tot_dupes:>10} "
547
+ f"{tot_remain:>10}"
548
+ )
549
+ report_lines.append(total_line)
550
+
551
+ # Duplicates key summary (top)
552
+ if dedupe != "none":
553
+ report_lines += _section_line("Duplicate keys (top)")
554
+ total_dupes_global = sum(duplicates_counter.values())
555
+ report_lines.append(f"Mode : {dedupe}")
556
+ report_lines.append(f"Keep : {dedupe_keep}")
557
+ report_lines.append(f"Removed: {total_dupes_global}")
558
+ if total_dupes_global > 0:
559
+ report_lines.append("Details (top):")
560
+ for k, c in duplicates_counter.most_common(DUPES_DETAILS_LIMIT):
561
+ report_lines.append(f" {k} : {c} duplicate(s)")
562
+
563
+ # Summary stats
564
+ if stats:
565
+ def head(counter: Counter, n: int = 10):
566
+ return [(k, c) for k, c in counter.most_common(n) if k]
567
+
568
+ report_lines += _section_line("Summary stats")
569
+ # Year
570
+ report_lines.append("By year:")
571
+ for y in sorted(year_counter.keys()):
572
+ report_lines.append(f" {y:>6} : {year_counter[y]}")
573
+ # Ref type
574
+ report_lines.append("")
575
+ report_lines.append("By ref_type (top):")
576
+ for k, c in head(type_counter, STATS_LIST_LIMIT):
577
+ report_lines.append(f" {k}: {c}")
578
+ # Journal
579
+ report_lines.append("")
580
+ report_lines.append(f"By journal (top {STATS_LIST_LIMIT}):")
581
+ for k, c in head(journal_counter, STATS_LIST_LIMIT):
582
+ report_lines.append(f" {k}: {c}")
583
+ # Authors
584
+ report_lines.append("")
585
+ report_lines.append(f"Top authors (top {top_authors}):")
586
+ for k, c in head(author_counter, top_authors):
587
+ report_lines.append(f" {k}: {c}")
588
+
589
+ # Optional JSON dump
590
+ if stats_json:
591
+ ensure_parent_dir(stats_json)
592
+ with open(stats_json, "w", encoding="utf-8") as jf:
593
+ json.dump(
594
+ {
595
+ "totals": {
596
+ "exported": total,
597
+ "files_processed": len(inputs),
598
+ "duration_seconds": duration,
599
+ },
600
+ "by_year": dict(year_counter),
601
+ "by_ref_type": dict(type_counter),
602
+ "by_journal": dict(journal_counter),
603
+ "top_authors": author_counter.most_common(top_authors),
604
+ "duplicates": {
605
+ "mode": dedupe,
606
+ "keep": dedupe_keep,
607
+ "removed": sum(duplicates_counter.values()) if dedupe != "none" else 0,
608
+ "top": duplicates_counter.most_common(DUPES_DETAILS_LIMIT) if dedupe != "none" else [],
609
+ "by_database": per_db,
610
+ },
611
+ },
612
+ jf,
613
+ ensure_ascii=False,
614
+ indent=2,
615
+ )
616
+
617
+ # Write report unless disabled
618
+ final_report_path: Optional[Path] = report_path
619
+ if final_report_path is not None:
620
+ final_report_path = final_report_path or out_path.with_name(out_path.stem + "_report.txt")
621
+ ensure_parent_dir(final_report_path)
622
+ with open(final_report_path, "w", encoding="utf-8") as rf:
623
+ rf.write("\n".join(report_lines))
624
+
625
+ return total, out_path, final_report_path
199
626
 
200
- def export(xml_file: Path, csv_path: Path, **kwargs):
201
- """Convenience: single XML file to CSV (+report)."""
627
+
628
+ # ----------------------------
629
+ # Back-compat convenience wrappers (CSV only)
630
+ # ----------------------------
631
+
632
+ def export_files_to_csv_with_report(
633
+ inputs: List[Path],
634
+ csv_path: Path,
635
+ report_path: Optional[Path] = None,
636
+ *,
637
+ fieldnames: List[str] = None,
638
+ delimiter: str = ",",
639
+ quoting: str = "minimal",
640
+ include_header: bool = True,
641
+ encoding: str = "utf-8",
642
+ ref_type: Optional[str] = None,
643
+ year: Optional[str] = None,
644
+ max_records_per_file: Optional[int] = None,
645
+ ) -> Tuple[int, Path, Optional[Path]]:
646
+ """Legacy API: export to CSV + TXT report (or no report if report_path=None)."""
647
+ return export_files_with_report(
648
+ inputs=inputs,
649
+ out_path=csv_path,
650
+ out_format="csv",
651
+ fieldnames=fieldnames,
652
+ delimiter=delimiter,
653
+ quoting=quoting,
654
+ include_header=include_header,
655
+ encoding=encoding,
656
+ ref_type=ref_type,
657
+ year=year,
658
+ max_records_per_file=max_records_per_file,
659
+ report_path=report_path,
660
+ )
661
+
662
+
663
+ def export(xml_file: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
664
+ """Convenience: single XML file to CSV (+report unless disabled)."""
202
665
  return export_files_to_csv_with_report([xml_file], csv_path, **kwargs)
203
666
 
204
- def export_folder(folder: Path, csv_path: Path, **kwargs):
205
- """Convenience: all *.xml in folder to CSV (+report)."""
667
+
668
+ def export_folder(folder: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
669
+ """Convenience: all *.xml in folder to CSV (+report unless disabled)."""
206
670
  inputs = sorted(p for p in Path(folder).glob("*.xml") if p.is_file())
207
671
  if not inputs:
208
672
  raise FileNotFoundError(f"No *.xml found in {folder}")
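A minimal sketch of calling the new generic exporter directly, assuming the module layout shown in this diff; the input and output paths below are placeholders.

```python
# Sketch only: export two EndNote XML files to JSON with DOI-based deduplication.
from pathlib import Path
from endnote_utils.core import export_files_with_report

total, out_path, report_path = export_files_with_report(
    inputs=[Path("data/IEEE.xml"), Path("data/Scopus.xml")],
    out_path=Path("output/merged.json"),
    out_format="json",
    dedupe="doi",             # drop repeated DOIs, keeping the first occurrence
    stats=True,               # add year/ref_type/journal/author stats to the report
    stats_json=Path("output/stats.json"),
    report_path=Path("output/merged_report.txt"),
)
print(f"Exported {total} record(s) to {out_path}; report at {report_path}")
```

Note that `report_path` defaults to `None`, in which case the TXT report is skipped; it is the CLI layer that supplies the `<out>_report.txt` default.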
endnote_utils-0.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: endnote-utils
3
+ Version: 0.2.0
4
+ Summary: Convert EndNote XML to CSV/JSON/XLSX with streaming parse and TXT report.
5
+ Author-email: Minh Quach <minhquach8@gmail.com>
6
+ License: MIT
7
+ Keywords: endnote,xml,csv,bibliography,research
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.8
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: openpyxl>=3.1.0
14
+
15
+ # EndNote Utils
16
+
17
+ Convert **EndNote XML files** into clean CSV/JSON/XLSX with automatic TXT reports.
18
+ Supports both **Python API** and **command-line interface (CLI)**.
19
+
20
+ ---
21
+
22
+ ## Features
23
+
24
+ - ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
25
+ - ✅ Streams `<record>` elements using `iterparse` (low memory usage)
26
+ - ✅ Extracts fields:
27
+ `database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, keywords, publisher, isbn, language, extracted_date`
28
+ - ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
29
+ - ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
30
+ - ✅ Supports **multiple output formats**: CSV, JSON, XLSX
31
+ - ✅ Generates a **TXT report** by default (default: `<out>_report.txt`; disable with `--no-report`) with:
32
+ - per-file counts (exported/skipped)
33
+ - totals, files processed
34
+ - run timestamp & duration
35
+ - **duplicate table** per database (Origin / Retractions / Duplicates / Remaining)
36
+ - optional duplicate key list (top-N)
37
+ - optional summary stats (year, ref_type, journal, top authors)
38
+ - ✅ Auto-creates output folders if missing
39
+ - ✅ Deduplication:
40
+ - `--dedupe doi` (unique by DOI)
41
+ - `--dedupe title-year` (unique by normalized title + year)
42
+ - `--dedupe-keep first|last` (keep first or last occurrence within each file)
43
+ - ✅ Summary stats (`--stats`) with optional JSON export (`--stats-json`)
44
+ - ✅ CLI options for CSV formatting, filters, verbosity
45
+ - ✅ Importable Python API for scripting & integration
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ### From PyPI
52
+
53
+ ```bash
54
+ pip install endnote-utils
55
+ ```
56
+
57
+ Requires **Python 3.8+**.
58
+
59
+ ---
60
+
61
+ ## Usage
62
+
63
+ ### Command Line
64
+
65
+ #### Single file
66
+
67
+ ```bash
68
+ endnote-utils --xml data/IEEE.xml --out output/ieee.csv
69
+ ```
70
+
71
+ #### Folder with multiple files
72
+
73
+ ```bash
74
+ endnote-utils --folder data/xmls --out output/all_records.csv
75
+ ```
76
+
77
+ #### Custom report path
78
+
79
+ ```bash
80
+ endnote-utils \
81
+ --xml data/Scopus.xml \
82
+ --out output/scopus.csv \
83
+ --report reports/scopus_run.txt \
84
+ --stats \
85
+ --verbose
86
+ ```
87
+
88
+ If `--report` is not provided, it defaults to `<out>_report.txt`.
89
+ Use `--no-report` to disable report generation.
90
+
91
+ ---
92
+
93
+ ### CLI Options
94
+
95
+ | Option | Description | Default |
96
+ | --------------- | --------------------------------------------------- | ------------------ |
97
+ | `--xml` | Path to a single EndNote XML file | – |
98
+ | `--folder` | Path to a folder containing multiple `*.xml` files | – |
99
+ | `--csv` | (Legacy) Output CSV path | – |
100
+ | `--out` | Generic output path (`.csv`, `.json`, `.xlsx`) | – |
101
+ | `--format` | Explicit format (`csv`, `json`, `xlsx`) | inferred |
102
+ | `--report` | Output TXT report path | `<out>_report.txt` |
103
+ | `--no-report` | Disable TXT report completely | – |
104
+ | `--delimiter` | CSV delimiter | `,` |
105
+ | `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
106
+ | `--no-header` | Suppress CSV header row | – |
107
+ | `--encoding` | Output text encoding | `utf-8` |
108
+ | `--ref-type` | Only include records with this `ref_type` name | – |
109
+ | `--year` | Only include records with this year | – |
110
+ | `--max-records` | Stop after N records per file (for testing) | – |
111
+ | `--dedupe` | Deduplicate mode: `none`, `doi`, `title-year` | `none` |
112
+ | `--dedupe-keep` | Deduplication strategy: `first`, `last` | `first` |
113
+ | `--stats` | Include summary stats in TXT report | – |
114
+ | `--stats-json`  | Path to JSON file to save stats & duplicate info    | –                  |
+ | `--top-authors` | Number of top authors listed in report/stats JSON   | `10`               |
115
+ | `--verbose` | Verbose logging with debug details | – |
116
+
117
+ ---
118
+
119
+ ### Example Report (snippet)
120
+
121
+ ```
122
+ ========================================
123
+ EndNote Export Report
124
+ ========================================
125
+ Run started : 2025-09-11 14:30:22
126
+ Files : 4
127
+ Duration : 0.47 seconds
128
+
129
+ Per-file results
130
+ ----------------------------------------
131
+ GGScholar.xml : 13 exported, 0 skipped
132
+ IEEE.xml : 2147 exported, 0 skipped
133
+ PubMed.xml : 504 exported, 0 skipped
134
+ Scopus.xml : 847 exported, 0 skipped
135
+ TOTAL exported: 3511
136
+
137
+ Duplicates table (by database)
138
+ ----------------------------------------
139
+ Database Origin Retractions Duplicates Remaining
140
+ ------------------------------------------------------------
141
+ GGScholar 179 0 27 152
142
+ IEEE 1900 0 589 1311
143
+ PubMed 320 0 225 95
144
+ Scopus 1999 1 511 1489
145
+ TOTAL 4410 1 1352 3047
146
+
147
+ Duplicate keys (top)
148
+ ----------------------------------------
149
+ Mode : doi
150
+ Keep : first
151
+ Removed: 1352
152
+ Details (top):
153
+ 10.1109/SPMB55497.2022.10014965 : 3 duplicate(s)
154
+ 10.1109/TSSA63730.2024.10864368 : 2 duplicate(s)
155
+
156
+ Summary stats
157
+ ----------------------------------------
158
+ By year:
159
+ 2022 : 569
160
+ 2023 : 684
161
+ 2024 : 1148
162
+ 2025 : 1108
163
+
164
+ By ref_type (top):
165
+ Journal Article: 2037
166
+ Conference Proceedings: 1470
167
+ Book Section: 4
168
+
169
+ By journal (top 20):
170
+ IEEE Access: 175
171
+ IEEE Journal of Biomedical and Health Informatics: 67
172
+ ...
173
+
174
+ Top authors (top 10):
175
+ Y. Wang: 50
176
+ X. Wang: 35
177
+ ...
178
+ ```
179
+
180
+ ---
181
+
182
+ ## Python API
183
+
184
+ ```python
185
+ from pathlib import Path
186
+ from endnote_utils import export, export_folder
187
+
188
+ # Single file
189
+ total, out_file, report_file = export(
190
+ Path("data/IEEE.xml"),
191
+ Path("output/ieee.csv"),
192
+ dedupe="doi", stats=True
193
+ )
194
+
195
+ # Folder
196
+ total, out_file, report_file = export_folder(
197
+ Path("data/xmls"),
198
+ Path("output/all.csv"),
199
+ ref_type="Conference Proceedings",
200
+ year="2024",
201
+ dedupe="title-year",
202
+ dedupe_keep="last",
203
+ stats=True,
204
+ stats_json=Path("output/stats.json"),
205
+ )
206
+ ```
207
+
208
+ ---
209
+
210
+ ## Development Notes
211
+
212
+ * Pure Python; the core logic uses only the standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`, `json`).
212
+ * Excel `.xlsx` export uses `openpyxl`, which is declared as a project dependency.
214
+ * Streaming XML parsing avoids high memory usage.
215
+ * Deduplication strategies configurable (`doi` / `title-year`).
216
+ * Report includes per-database table and optional JSON snapshot.
217
+ * Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
218
+
219
+ ---
220
+
221
+ ## License
222
+
223
+ MIT License © 2025 Minh Quach
endnote_utils-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
1
+ endnote_utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
2
+ endnote_utils/cli.py,sha256=QFE73sKPMEbRiOuCVpMMQXT3RBx854uU-GS-ZHQv1Kw,7025
3
+ endnote_utils/core.py,sha256=e52ebYHx2QdY3juS3Jt8-SQhJyDLvIycaj0WhIatang,22960
4
+ endnote_utils-0.2.0.dist-info/METADATA,sha256=wllJhkRJlwO1eUROFNqvunl-rdSNiaSKzrTVH4p8zVs,7252
5
+ endnote_utils-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ endnote_utils-0.2.0.dist-info/entry_points.txt,sha256=l8OEYTGiRj49CND6Xmpk4cIlAE8WJg6UInRo-YRvg8w,57
7
+ endnote_utils-0.2.0.dist-info/top_level.txt,sha256=6ZlEkqvnKvYAHI7P3wlh5j3vDQF4-bKLIdYCwPTL-G8,14
8
+ endnote_utils-0.2.0.dist-info/RECORD,,
endnote_utils-0.1.4.dist-info/METADATA DELETED
@@ -1,145 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: endnote-utils
3
- Version: 0.1.4
4
- Summary: Convert EndNote XML to CSV with streaming parse and TXT report.
5
- Author-email: Minh Quach <minhquach8@gmail.com>
6
- License: MIT
7
- Keywords: endnote,xml,csv,bibliography,research
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: License :: OSI Approved :: MIT License
10
- Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.8
12
- Description-Content-Type: text/markdown
13
-
14
- # EndNote Utils
15
-
16
- Convert **EndNote XML files** into clean CSVs with automatic TXT reports.
17
- Supports both **Python API** and **command-line interface (CLI)**.
18
-
19
- ---
20
-
21
- ## Features
22
-
23
- - ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
24
- - ✅ Streams `<record>` elements using `iterparse` (low memory usage)
25
- - ✅ Extracts fields:
26
- `database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, extracted_date`
27
- - ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
28
- - ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
29
- - ✅ Always generates a **TXT report** (default: `<csv>_report.txt`) with:
30
- - per-file counts (exported/skipped records)
31
- - totals, files processed
32
- - run timestamp & duration
33
- - ✅ Auto-creates output folders if missing
34
- - ✅ CLI options for CSV formatting, filters, verbosity
35
- - ✅ Importable Python API for scripting & integration
36
-
37
- ---
38
-
39
- ## Installation
40
-
41
- ### From PyPI
42
-
43
- ```bash
44
- pip install endnote-utils
45
- ```
46
-
47
- Requires **Python 3.8+**.
48
-
49
- ---
50
-
51
- ## Usage
52
-
53
- ### Command Line
54
-
55
- #### Single file
56
-
57
- ```bash
58
- endnote-utils --xml data/IEEE.xml --csv output/ieee.csv
59
- ```
60
-
61
- #### Folder with multiple files
62
-
63
- ```bash
64
- endnote-utils --folder data/xmls --csv output/all_records.csv
65
- ```
66
-
67
- #### Custom report path
68
-
69
- ```bash
70
- endnote-utils \
71
- --xml data/Scopus.xml \
72
- --csv output/scopus.csv \
73
- --report reports/scopus_run.txt
74
- ```
75
-
76
- If `--report` is not provided, it defaults to `<csv>_report.txt`.
77
-
78
- ---
79
-
80
- ### CLI Options
81
-
82
- | Option | Description | Default |
83
- | --------------- | --------------------------------------------------- | ------------------ |
84
- | `--xml` | Path to a single EndNote XML file | – |
85
- | `--folder` | Path to a folder containing multiple `*.xml` files | – |
86
- | `--csv` | Output CSV path | – |
87
- | `--report` | Output TXT report path | `<csv>_report.txt` |
88
- | `--delimiter` | CSV delimiter | `,` |
89
- | `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
90
- | `--no-header` | Suppress CSV header row | – |
91
- | `--encoding` | Output CSV encoding | `utf-8` |
92
- | `--ref-type` | Only include records with this `ref_type` name | – |
93
- | `--year` | Only include records with this year | – |
94
- | `--max-records` | Stop after N records per file (useful for testing) | – |
95
- | `--verbose` | Verbose logging with debug details | – |
96
-
97
- ---
98
-
99
- ### Example Report
100
-
101
- ```
102
- Run started: 2025-09-11 14:30:22
103
- IEEE.xml: 120 exported, 0 skipped
104
- Scopus.xml: 95 exported, 2 skipped
105
- TOTAL exported: 215
106
- Files processed: 2
107
- Duration: 3.14 seconds
108
- ```
109
-
110
- ---
111
-
112
- ## Python API
113
-
114
- You can also use it directly in Python scripts:
115
-
116
- ```python
117
- from pathlib import Path
118
- from endnote_utils import export, export_folder
119
-
120
- # Single file
121
- total, csv_out, report_out = export(
122
- Path("data/IEEE.xml"), Path("output/ieee.csv")
123
- )
124
-
125
- # Folder
126
- total, csv_out, report_out = export_folder(
127
- Path("data/xmls"), Path("output/all.csv"),
128
- ref_type="Conference Proceedings", year="2024"
129
- )
130
- ```
131
-
132
- ---
133
-
134
- ## Development Notes
135
-
136
- * Pure Python, uses only standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`).
137
- * Streaming XML parsing avoids high memory usage.
138
- * Robust error handling: skips malformed records but logs them in verbose mode.
139
- * Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
140
-
141
- ---
142
-
143
- ## License
144
-
145
- MIT License © 2025 Minh Quach
endnote_utils-0.1.4.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
1
- endnote_utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
2
- endnote_utils/cli.py,sha256=TQxdO7IlaRXwNTm0MpBVk9CeUTUGgtlcI0O3O9xhgdM,2160
3
- endnote_utils/core.py,sha256=cddpuRMF5RC5mp3Lll0eTA9MXLzcVDnDl1Z7IMHOr0k,7480
4
- endnote_utils-0.1.4.dist-info/METADATA,sha256=FXD6AXEFT1_lqYpuDYtX89XnNsQpZrftudMp-YzodQI,4316
5
- endnote_utils-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- endnote_utils-0.1.4.dist-info/entry_points.txt,sha256=l8OEYTGiRj49CND6Xmpk4cIlAE8WJg6UInRo-YRvg8w,57
7
- endnote_utils-0.1.4.dist-info/top_level.txt,sha256=6ZlEkqvnKvYAHI7P3wlh5j3vDQF4-bKLIdYCwPTL-G8,14
8
- endnote_utils-0.1.4.dist-info/RECORD,,