endnote-utils 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- endnote_utils/cli.py +164 -32
- endnote_utils/core.py +518 -54
- endnote_utils-0.2.0.dist-info/METADATA +223 -0
- endnote_utils-0.2.0.dist-info/RECORD +8 -0
- endnote_utils-0.1.4.dist-info/METADATA +0 -145
- endnote_utils-0.1.4.dist-info/RECORD +0 -8
- {endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/WHEEL +0 -0
- {endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/entry_points.txt +0 -0
- {endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/top_level.txt +0 -0
endnote_utils/cli.py
CHANGED
@@ -4,51 +4,183 @@ import argparse
 import logging
 import sys
 from pathlib import Path
+from typing import List, Optional, Tuple

-from .core import
+from .core import (
+    DEFAULT_FIELDNAMES,
+    export_files_with_report,  # generic writer: csv/json/xlsx
+)
+
+SUPPORTED_FORMATS = ("csv", "json", "xlsx")
+EXT_TO_FORMAT = {".csv": "csv", ".json": "json", ".xlsx": "xlsx"}


 def build_parser() -> argparse.ArgumentParser:
-    p = argparse.ArgumentParser(
+    p = argparse.ArgumentParser(
+        description="Export EndNote XML (file or folder) to CSV/JSON/XLSX with a TXT report."
+    )
+
+    # Input source (mutually exclusive)
     g = p.add_mutually_exclusive_group(required=True)
     g.add_argument("--xml", help="Path to a single EndNote XML file.")
     g.add_argument("--folder", help="Path to a folder containing *.xml files.")
-
-
-    p.add_argument(
-
-
-
-
-    p.add_argument(
-
-
+
+    # Output selection (CSV legacy flag + new generic flags)
+    p.add_argument(
+        "--csv",
+        required=False,
+        help="(Legacy) Output CSV path. Prefer --out for csv/json/xlsx.",
+    )
+    p.add_argument(
+        "--out",
+        required=False,
+        help="Generic output path; format inferred from file extension if --format not provided. "
+             "Supported extensions: .csv, .json, .xlsx",
+    )
+    p.add_argument(
+        "--format",
+        choices=SUPPORTED_FORMATS,
+        help="Output format. If omitted, inferred from --out extension or --csv.",
+    )
+
+    # Report controls
+    p.add_argument("--report", required=False, help="Path to TXT report (default: <output>_report.txt).")
+    p.add_argument(
+        "--no-report",
+        action="store_true",
+        help="Disable writing the TXT report (by default, a report is always generated).",
+    )
+
+    # CSV-specific formatting options (ignored for JSON/XLSX except delimiter/quoting/header)
+    p.add_argument("--delimiter", default=",", help="CSV delimiter (default: ',').")
+    p.add_argument(
+        "--quoting",
+        default="minimal",
+        choices=["minimal", "all", "nonnumeric", "none"],
+        help="CSV quoting mode (default: minimal).",
+    )
+    p.add_argument("--no-header", action="store_true", help="Do not write CSV header row.")
+    p.add_argument("--encoding", default="utf-8", help="Output text encoding (default: utf-8).")
+
+    # Filters / limits
+    p.add_argument("--ref-type", default=None, help="Filter by ref_type name.")
+    p.add_argument("--year", default=None, help="Filter by year.")
+    p.add_argument("--max-records", type=int, default=None, help="Max records per file (testing).")
+
+    # Deduplication & Stats
+    p.add_argument("--dedupe", choices=["none", "doi", "title-year"], default="none",
+                   help="Deduplicate records by key. Default: none.")
+    p.add_argument("--dedupe-keep", choices=["first", "last"], default="first",
+                   help="When duplicates found, keep the first or last occurrence. Default: first.")
+    p.add_argument("--stats", action="store_true",
+                   help="Compute summary stats and include them in the TXT report.")
+    p.add_argument("--stats-json",
+                   help="Optional JSON file path to write detailed stats (when --stats is used).")
+    p.add_argument("--top-authors", type=int, default=10,
+                   help="How many top authors to list in the report/stats JSON. Default: 10.")
+
+    # Verbosity
+    p.add_argument("--verbose", action="store_true", help="Verbose logging.")
+
     return p

+
+def _resolve_inputs(args: argparse.Namespace) -> List[Path]:
+    if args.xml:
+        xml_path = Path(args.xml)
+        if not xml_path.is_file():
+            raise FileNotFoundError(xml_path)
+        return [xml_path]
+
+    folder = Path(args.folder)
+    if not folder.is_dir():
+        raise FileNotFoundError(folder)
+    inputs = sorted(p for p in folder.glob("*.xml") if p.is_file())
+    if not inputs:
+        raise FileNotFoundError(f"No *.xml files found in folder: {folder}")
+    return inputs
+
+
+def _resolve_output_and_format(args: argparse.Namespace) -> tuple[Path, str, Optional[Path]]:
+    """
+    Decide final out_path, out_format, and report_path using:
+      - Prefer --out/--format if provided
+      - Fallback to --csv (legacy) which implies CSV
+      - If --no-report, return report_path=None
+    """
+    target_path: Optional[Path] = None
+    out_format: Optional[str] = None
+
+    if args.out:
+        target_path = Path(args.out)
+        out_format = args.format
+        if not out_format:
+            # infer from extension
+            out_format = EXT_TO_FORMAT.get(target_path.suffix.lower())
+            if not out_format:
+                raise SystemExit(
+                    "Cannot infer output format from extension. "
+                    "Use --format {csv,json,xlsx} or set a supported extension."
+                )
+    elif args.csv:
+        target_path = Path(args.csv)
+        out_format = args.format or "csv"
+        if out_format != "csv":
+            # user asked for non-csv but used --csv path
+            raise SystemExit("When using --csv, --format must be 'csv'. Use --out for json/xlsx.")
+    else:
+        raise SystemExit("You must provide either --out (preferred) or --csv (legacy).")
+
+    # Report path defaults next to chosen output file (unless disabled)
+    if args.no_report:
+        report_path: Optional[Path] = None
+    else:
+        report_path = Path(args.report) if args.report else target_path.with_name(target_path.stem + "_report.txt")
+
+    return target_path, out_format, report_path
+
+
 def main() -> None:
     args = build_parser().parse_args()
     logging.basicConfig(
         level=logging.DEBUG if args.verbose else logging.INFO,
-        format="%(levelname)s: %(message)s",
-
-    csv_path = Path(args.csv)
-    report_path = Path(args.report) if args.report else csv_path.with_name(csv_path.stem + "_report.txt")
-    kwargs = dict(
-        report_path=report_path,
-        fieldnames=DEFAULT_FIELDNAMES,
-        delimiter=args.delimiter,
-        quoting=args.quoting,
-        include_header=not args.no_header,
-        encoding=args.encoding,
-        ref_type=args.ref_type,
-        year=args.year,
-        max_records_per_file=args.max_records,
+        format="%(levelname)s: %(message)s",
+        stream=sys.stderr,
     )

-
-
-
-
+    try:
+        inputs = _resolve_inputs(args)
+        out_path, out_format, report_path = _resolve_output_and_format(args)
+
+        total, final_out, final_report = export_files_with_report(
+            inputs=inputs,
+            out_path=out_path,
+            out_format=out_format,
+            fieldnames=DEFAULT_FIELDNAMES,
+            delimiter=args.delimiter,
+            quoting=args.quoting,
+            include_header=not args.no_header,
+            encoding=args.encoding,
+            ref_type=args.ref_type,
+            year=args.year,
+            max_records_per_file=args.max_records,
+            dedupe=args.dedupe,
+            dedupe_keep=args.dedupe_keep,
+            stats=args.stats,
+            stats_json=Path(args.stats_json) if args.stats_json else None,
+            top_authors=args.top_authors,
+            report_path=report_path,  # may be None → core should skip writing report
+        )
+
+        logging.info("Exported %d record(s) → %s", total, final_out)
+        if report_path is None:
+            logging.info("Report disabled by --no-report.")
+        else:
+            logging.info("Report → %s", final_report)

-
-
+    except FileNotFoundError as e:
+        logging.error("File/folder not found: %s", e)
+        sys.exit(1)
+    except Exception as e:
+        logging.error("Unexpected error: %s", e)
+        sys.exit(2)
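The rewritten `main()` above delegates input discovery and output selection to the two new helpers. A minimal sketch of how the `--out`/`--format` resolution behaves (the paths are made-up examples; nothing is written, since `main()` is never called):

```python
from endnote_utils.cli import build_parser, _resolve_output_and_format

parser = build_parser()

# Format is inferred from the .xlsx extension of --out; the report defaults next to the output.
args = parser.parse_args(["--xml", "data/IEEE.xml", "--out", "output/ieee.xlsx"])
out_path, out_format, report_path = _resolve_output_and_format(args)
print(out_format)   # xlsx
print(report_path)  # output/ieee_report.txt

# The legacy --csv flag still works, but only for CSV; --no-report suppresses the report path.
args = parser.parse_args(["--xml", "data/IEEE.xml", "--csv", "output/ieee.csv", "--no-report"])
print(_resolve_output_and_format(args))  # (PosixPath('output/ieee.csv'), 'csv', None)
```

Anything else, for example `--out results.txt` with no `--format`, exits with the `SystemExit` message shown in `_resolve_output_and_format`.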
endnote_utils/core.py
CHANGED
@@ -1,17 +1,37 @@
-# src/
+# src/endnote_utils/core.py
 from __future__ import annotations

 import csv
+import json
 import logging
 import time
 import xml.etree.ElementTree as ET
+from collections import Counter
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Tuple

+# ----------------------------
+# Public constants
+# ----------------------------
+
 DEFAULT_FIELDNAMES: List[str] = [
-    "database",
-    "
+    "database",
+    "ref_type",
+    "title",
+    "journal",
+    "authors",
+    "year",
+    "volume",
+    "number",
+    "abstract",
+    "doi",
+    "urls",
+    "keywords",
+    "publisher",
+    "isbn",
+    "language",
+    "extracted_date",
 ]

 CSV_QUOTING_MAP = {
@@ -21,17 +41,27 @@ CSV_QUOTING_MAP = {
     "none": csv.QUOTE_NONE,
 }

+# Report layout
+DUPES_DETAILS_LIMIT = 50
+STATS_LIST_LIMIT = 20
+
+
+# ----------------------------
+# FS helpers
+# ----------------------------
+
 def ensure_parent_dir(p: Path) -> None:
+    """Create parent directory if it doesn't exist."""
     p.parent.mkdir(parents=True, exist_ok=True)

+
 # ----------------------------
-#
+# Text helpers
 # ----------------------------

 def clean_text(text: Optional[str]) -> str:
     """
     Trim, collapse internal whitespace, remove stray CRs, keep punctuation intact.
-    Safer for CSV fields than aggressive normalization.
     """
     if not text:
         return ""
@@ -48,14 +78,18 @@ def safe_find_text(node: ET.Element, path: str) -> str:
 def join_nonempty(items: Iterable[str], sep: str) -> str:
     return sep.join(x for x in (i.strip() for i in items) if x)

-
-
-
-
+
+def normalize_text_for_key(s: str) -> str:
+    """Lowercase + strip non-alnum + single-space. Good for stable keys."""
+    if not s:
+        return ""
+    s = s.lower()
+    s = "".join(ch for ch in s if ch.isalnum() or ch.isspace())
+    return " ".join(s.split())


 # ----------------------------
-# Record
+# Record extraction
 # ----------------------------

 def process_doi(record: ET.Element) -> str:
|
|
97
131
|
return join_nonempty(deduped, " | ")
|
98
132
|
|
99
133
|
|
134
|
+
def extract_keywords(record: ET.Element) -> str:
|
135
|
+
"""Collect keywords from //keywords/keyword/style, joined by '; ' (deduped)."""
|
136
|
+
items: List[str] = []
|
137
|
+
for kw in record.findall(".//keywords/keyword"):
|
138
|
+
style = kw.find("style")
|
139
|
+
if style is not None and style.text:
|
140
|
+
items.append(clean_text(style.text))
|
141
|
+
seen = set()
|
142
|
+
out: List[str] = []
|
143
|
+
for x in items:
|
144
|
+
if x not in seen:
|
145
|
+
seen.add(x)
|
146
|
+
out.append(x)
|
147
|
+
return join_nonempty(out, "; ")
|
148
|
+
|
149
|
+
|
100
150
|
def process_record(record: ET.Element, database: str) -> Dict[str, str]:
|
101
|
-
"""Transform a <record> element into a dictionary
|
151
|
+
"""Transform a <record> element into a flat dictionary."""
|
102
152
|
ref_type_name = ""
|
103
153
|
ref_type = record.find("ref-type")
|
104
154
|
if ref_type is not None:
|
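The new `extract_keywords()` walks `keywords/keyword/style` nodes, cleans each value, and drops repeats while keeping order. A small sketch on a hand-written fragment (not a real EndNote export):

```python
import xml.etree.ElementTree as ET

from endnote_utils.core import extract_keywords

record = ET.fromstring(
    "<record><keywords>"
    "<keyword><style>deep learning</style></keyword>"
    "<keyword><style>ECG</style></keyword>"
    "<keyword><style>deep learning</style></keyword>"  # repeated keyword, dropped
    "</keywords></record>"
)
print(extract_keywords(record))  # deep learning; ECG
```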
@@ -116,10 +166,20 @@ def process_record(record: ET.Element, database: str) -> Dict[str, str]:
         "abstract": safe_find_text(record, ".//abstract/style"),
         "doi": process_doi(record),
         "urls": extract_urls(record),
+        "keywords": extract_keywords(record),
+        "publisher": safe_find_text(record, ".//publisher/style"),
+        "isbn": safe_find_text(record, ".//isbn/style"),
+        "language": safe_find_text(record, ".//language/style"),
         "extracted_date": datetime.now().strftime("%Y-%m-%d"),
     }

+
+# ----------------------------
+# XML streaming + filters
+# ----------------------------
+
 def iter_records(xml_path: Path) -> Iterable[ET.Element]:
+    """Stream <record> elements with low memory footprint."""
     context = ET.iterparse(str(xml_path), events=("start", "end"))
     _, root = next(context)
     for event, elem in context:
@@ -128,15 +188,141 @@ def iter_records(xml_path: Path) -> Iterable[ET.Element]:
             elem.clear()
             root.clear()

+
 def record_matches_filters(row: Dict[str, str], ref_type: Optional[str], year: Optional[str]) -> bool:
-    if ref_type and row.get("ref_type") != ref_type:
-
+    if ref_type and row.get("ref_type") != ref_type:
+        return False
+    if year and row.get("year") != str(year):
+        return False
     return True

-
+
+# ----------------------------
+# Deduplication helpers
+# ----------------------------
+
+def dedupe_key(row: Dict[str, str], mode: str) -> Optional[str]:
+    """
+    mode: 'none' | 'doi' | 'title-year'
+    Returns None when no applicable key can be formed (row passes through).
+    """
+    if mode == "doi":
+        k = (row.get("doi") or "").strip()
+        return k or None
+    if mode == "title-year":
+        title = normalize_text_for_key(row.get("title", ""))
+        year = (row.get("year") or "").strip()
+        if title and year:
+            return f"{title}::{year}"
+        return None
+    return None
+
+
+# ----------------------------
+# Retraction detection (basic heuristic)
+# ----------------------------
+
+def is_retraction(record: ET.Element) -> bool:
+    """
+    Heuristic: consider a record 'retraction' if its notes or title contain substrings like:
+    'retraction', 'retracted', 'withdrawn', 'erratum'.
+    """
+    text_blob = " ".join(
+        [
+            safe_find_text(record, ".//notes/style"),
+            safe_find_text(record, ".//title/style"),
+        ]
+    ).lower()
+    indicators = ("retraction", "retracted", "withdrawn", "erratum")
+    return any(tok in text_blob for tok in indicators)
+
+
+# ----------------------------
+# Writers (CSV / JSON / XLSX)
+# ----------------------------
+
+def _write_rows_csv(
+    rows_iter: Iterable[Dict[str, str]],
+    out_path: Path,
+    fieldnames: List[str],
+    delimiter: str,
+    quoting: str,
+    include_header: bool,
+    encoding: str,
+) -> int:
+    qmode = CSV_QUOTING_MAP[quoting.lower()]
+    ensure_parent_dir(out_path)
+    count = 0
+    with open(out_path, "w", newline="", encoding=encoding) as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
+        if include_header:
+            writer.writeheader()
+        for row in rows_iter:
+            writer.writerow({k: row.get(k, "") for k in fieldnames})
+            count += 1
+    return count
+
+
+def _write_rows_json(
+    rows_iter: Iterable[Dict[str, str]],
+    out_path: Path,
+    fieldnames: List[str],
+    encoding: str,
+) -> int:
+    """Write a JSON array streaming without holding all rows in memory."""
+    ensure_parent_dir(out_path)
+    count = 0
+    with open(out_path, "w", encoding=encoding) as f:
+        f.write("[")
+        first = True
+        for row in rows_iter:
+            obj = {k: row.get(k, "") for k in fieldnames}
+            if first:
+                first = False
+            else:
+                f.write(",")
+            f.write(json.dumps(obj, ensure_ascii=False))
+            count += 1
+        f.write("]")
+    return count
+
+
+def _write_rows_xlsx(
+    rows_iter: Iterable[Dict[str, str]],
+    out_path: Path,
+    fieldnames: List[str],
+) -> int:
+    """Write an Excel file using openpyxl (installed via project dependencies)."""
+    try:
+        from openpyxl import Workbook
+    except ImportError as e:
+        raise RuntimeError(
+            "Excel output requires 'openpyxl'. Ensure it is installed."
+        ) from e
+
+    ensure_parent_dir(out_path)
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "records"
+    ws.append(fieldnames)  # header
+
+    count = 0
+    for row in rows_iter:
+        ws.append([row.get(k, "") for k in fieldnames])
+        count += 1
+
+    wb.save(out_path)
+    return count
+
+
+# ----------------------------
+# Generic export + report (+ dedupe + stats, pretty report + duplicates table)
+# ----------------------------
+
+def export_files_with_report(
     inputs: List[Path],
-
-
+    out_path: Path,
+    out_format: str,  # "csv" | "json" | "xlsx"
     *,
     fieldnames: List[str] = None,
     delimiter: str = ",",
@@ -146,63 +332,341 @@ def export_files_to_csv_with_report(
     ref_type: Optional[str] = None,
     year: Optional[str] = None,
     max_records_per_file: Optional[int] = None,
-
-
-
-
-
+    report_path: Optional[Path] = None,
+    # Dedup + stats
+    dedupe: str = "none",
+    dedupe_keep: str = "first",
+    stats: bool = False,
+    stats_json: Optional[Path] = None,
+    top_authors: int = 10,
+) -> Tuple[int, Path, Optional[Path]]:
+    """
+    Stream records from one or many EndNote XML files and write to CSV/JSON/XLSX.
+    Writes a pretty TXT report unless report_path is None.

-
-
+    Deduplication:
+      - dedupe='doi' → unique by DOI
+      - dedupe='title-year' → unique by normalized (title, year)
+      - dedupe_keep='first' or 'last' (applies within each input file)

-
-
-    run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    Stats (when stats=True) add counts by year/ref_type/journal and top authors.
+    stats_json (if provided) writes a JSON snapshot of these stats + duplicates.

-
-
-
-
+    The report now includes a per-database table: Origin / Retractions / Duplicates / Remaining.
+
+    Returns (total_rows_written, out_path, report_path or None if disabled).
+    """
+    fieldnames = fieldnames or DEFAULT_FIELDNAMES
+    out_format = out_format.lower()
+    if out_format not in {"csv", "json", "xlsx"}:
+        raise ValueError(f"Unknown out_format: {out_format}")
+
+    # Per-run accumulators
+    per_file_lines: List[str] = []
+
+    year_counter = Counter()
+    type_counter = Counter()
+    journal_counter = Counter()
+    author_counter = Counter()
+
+    # Dedupe state
+    seen_keys: set[str] = set()
+    duplicates_counter = Counter()  # global key -> duplicate count
+    dupes_removed_per_db: Dict[str, int] = {}
+
+    # Per-database accounting for the table
+    # Origin: records after filters, before dedupe (and before max_records per file cap)
+    # Retractions: simple heuristic via is_retraction()
+    # Duplicates: removed due to dedupe
+    # Remaining: exported (post-dedupe)
+    per_db: Dict[str, Dict[str, int]] = {}
+
+    def rows() -> Iterable[Dict[str, str]]:
+        nonlocal per_file_lines, seen_keys, duplicates_counter, per_db
+        nonlocal year_counter, type_counter, journal_counter, author_counter, dupes_removed_per_db

         for xml_path in inputs:
             database = xml_path.stem
+            per_db.setdefault(database, {"origin": 0, "retractions": 0, "duplicates": 0, "remaining": 0})
+            dupes_removed_per_db.setdefault(database, 0)
+
             logging.info("Processing %s (database=%s)", xml_path.name, database)
-
+
+            produced = 0
+            skipped = 0
+
+            buffered: List[Dict[str, str]] = []
+            buffered_keys_index: Dict[str, int] = {}

             for rec in iter_records(xml_path):
                 try:
-                    row
-
-
-
-
-
-
+                    # Build row (for filters & output)
+                    row = process_record(rec, database=database)
+
+                    # Filter
+                    if not record_matches_filters(row, ref_type, year):
+                        continue
+
+                    # Origin++ (count any passing-filter record before dedupe)
+                    per_db[database]["origin"] += 1
+
+                    # Retraction heuristic
+                    if is_retraction(rec):
+                        per_db[database]["retractions"] += 1
+
+                    # Dedup
+                    k = dedupe_key(row, dedupe)
+                    if k and dedupe != "none":
+                        if dedupe_keep == "first":
+                            if k in seen_keys:
+                                duplicates_counter[k] += 1
+                                per_db[database]["duplicates"] += 1
+                                dupes_removed_per_db[database] += 1
+                                continue
+                            seen_keys.add(k)
+                            buffered.append(row)
+                            produced += 1
+                        else:  # keep last within this file
+                            if k in buffered_keys_index:
+                                # replace old occurrence in this file buffer
+                                prev_idx = buffered_keys_index[k]
+                                buffered[prev_idx] = row
+                                duplicates_counter[k] += 1
+                                per_db[database]["duplicates"] += 1
+                                dupes_removed_per_db[database] += 1
+                            else:
+                                buffered_keys_index[k] = len(buffered)
+                                buffered.append(row)
+                                produced += 1
+                            seen_keys.add(k)
+                    else:
+                        buffered.append(row)
+                        produced += 1
+
+                    if max_records_per_file and produced >= max_records_per_file:
+                        break
+
                 except Exception:
-
+                    skipped += 1
                     logging.debug("Record error in %s", xml_path, exc_info=True)

-
+            # Remaining = exported from this file
+            per_db[database]["remaining"] += len(buffered)
+
+            per_file_lines.append(f"{xml_path.name:<15} : {len(buffered)} exported, {skipped} skipped")
+
+            # Stats
+            if stats:
+                for r in buffered:
+                    y = (r.get("year") or "").strip()
+                    t = (r.get("ref_type") or "").strip()
+                    j = (r.get("journal") or "").strip()
+                    if y:
+                        year_counter[y] += 1
+                    if t:
+                        type_counter[t] += 1
+                    if j:
+                        journal_counter[j] += 1
+                    if r.get("authors"):
+                        for a in (x.strip() for x in r["authors"].split(";")):
+                            if a:
+                                author_counter[a] += 1
+
+            # Yield to writer
+            for r in buffered:
+                yield r
+
+    # Select writer
+    start_ts = time.time()
+    run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

-
-
-
-
-
-
-
+    if out_format == "csv":
+        total = _write_rows_csv(rows(), out_path, fieldnames, delimiter, quoting, include_header, encoding)
+    elif out_format == "json":
+        total = _write_rows_json(rows(), out_path, fieldnames, encoding)
+    else:  # xlsx
+        total = _write_rows_xlsx(rows(), out_path, fieldnames)
+
+    duration = time.time() - start_ts
+
+    # ---------- Pretty report builder ----------
+    def _header_line(title: str) -> List[str]:
+        bar = "=" * 40
+        return [bar, title, bar]
+
+    def _section_line(title: str) -> List[str]:
+        return ["", title, "-" * 40]
+
+    report_lines: List[str] = []
+    report_lines += _header_line("EndNote Export Report")
+    report_lines += [
+        f"Run started : {run_start}",
+        f"Files : {len(inputs)}",
+        f"Duration : {duration:.2f} seconds",
     ]
-    with open(report_path, "w", encoding="utf-8") as rf:
-        rf.write("\n".join(report_lines))

-
+    # Per-file section
+    report_lines += _section_line("Per-file results")
+    report_lines += per_file_lines
+    report_lines.append(f"TOTAL exported: {total}")
+
+    # Per-database duplicates table
+    # Build totals row
+    if per_db:
+        report_lines += _section_line("Duplicates table (by database)")
+        # compute column widths
+        db_names = list(per_db.keys())
+        db_col_w = max([len("Database")] + [len(db) for db in db_names])
+
+        # totals
+        tot_origin = sum(d["origin"] for d in per_db.values())
+        tot_retract = sum(d["retractions"] for d in per_db.values())
+        tot_dupes = sum(d["duplicates"] for d in per_db.values())
+        tot_remain = sum(d["remaining"] for d in per_db.values())
+
+        header = f"{'Database':<{db_col_w}} {'Origin':>8} {'Retractions':>12} {'Duplicates':>10} {'Remaining':>10}"
+        report_lines.append(header)
+        report_lines.append("-" * len(header))
+
+        for db in sorted(per_db.keys()):
+            d = per_db[db]
+            line = (
+                f"{db:<{db_col_w}} "
+                f"{d['origin']:>8} "
+                f"{d['retractions']:>12} "
+                f"{d['duplicates']:>10} "
+                f"{d['remaining']:>10}"
+            )
+            report_lines.append(line)
+
+        total_line = (
+            f"{'TOTAL':<{db_col_w}} "
+            f"{tot_origin:>8} "
+            f"{tot_retract:>12} "
+            f"{tot_dupes:>10} "
+            f"{tot_remain:>10}"
+        )
+        report_lines.append(total_line)
+
+    # Duplicates key summary (top)
+    if dedupe != "none":
+        report_lines += _section_line("Duplicate keys (top)")
+        total_dupes_global = sum(duplicates_counter.values())
+        report_lines.append(f"Mode : {dedupe}")
+        report_lines.append(f"Keep : {dedupe_keep}")
+        report_lines.append(f"Removed: {total_dupes_global}")
+        if total_dupes_global > 0:
+            report_lines.append("Details (top):")
+            for k, c in duplicates_counter.most_common(DUPES_DETAILS_LIMIT):
+                report_lines.append(f"  {k} : {c} duplicate(s)")
+
+    # Summary stats
+    if stats:
+        def head(counter: Counter, n: int = 10):
+            return [(k, c) for k, c in counter.most_common(n) if k]
+
+        report_lines += _section_line("Summary stats")
+        # Year
+        report_lines.append("By year:")
+        for y in sorted(year_counter.keys()):
+            report_lines.append(f"  {y:>6} : {year_counter[y]}")
+        # Ref type
+        report_lines.append("")
+        report_lines.append("By ref_type (top):")
+        for k, c in head(type_counter, STATS_LIST_LIMIT):
+            report_lines.append(f"  {k}: {c}")
+        # Journal
+        report_lines.append("")
+        report_lines.append(f"By journal (top {STATS_LIST_LIMIT}):")
+        for k, c in head(journal_counter, STATS_LIST_LIMIT):
+            report_lines.append(f"  {k}: {c}")
+        # Authors
+        report_lines.append("")
+        report_lines.append(f"Top authors (top {top_authors}):")
+        for k, c in head(author_counter, top_authors):
+            report_lines.append(f"  {k}: {c}")
+
+    # Optional JSON dump
+    if stats_json:
+        ensure_parent_dir(stats_json)
+        with open(stats_json, "w", encoding="utf-8") as jf:
+            json.dump(
+                {
+                    "totals": {
+                        "exported": total,
+                        "files_processed": len(inputs),
+                        "duration_seconds": duration,
+                    },
+                    "by_year": dict(year_counter),
+                    "by_ref_type": dict(type_counter),
+                    "by_journal": dict(journal_counter),
+                    "top_authors": author_counter.most_common(top_authors),
+                    "duplicates": {
+                        "mode": dedupe,
+                        "keep": dedupe_keep,
+                        "removed": sum(duplicates_counter.values()) if dedupe != "none" else 0,
+                        "top": duplicates_counter.most_common(DUPES_DETAILS_LIMIT) if dedupe != "none" else [],
+                        "by_database": per_db,
+                    },
+                },
+                jf,
+                ensure_ascii=False,
+                indent=2,
+            )
+
+    # Write report unless disabled
+    final_report_path: Optional[Path] = report_path
+    if final_report_path is not None:
+        final_report_path = final_report_path or out_path.with_name(out_path.stem + "_report.txt")
+        ensure_parent_dir(final_report_path)
+        with open(final_report_path, "w", encoding="utf-8") as rf:
+            rf.write("\n".join(report_lines))
+
+    return total, out_path, final_report_path

-
-
+
+# ----------------------------
+# Back-compat convenience wrappers (CSV only)
+# ----------------------------
+
+def export_files_to_csv_with_report(
+    inputs: List[Path],
+    csv_path: Path,
+    report_path: Optional[Path] = None,
+    *,
+    fieldnames: List[str] = None,
+    delimiter: str = ",",
+    quoting: str = "minimal",
+    include_header: bool = True,
+    encoding: str = "utf-8",
+    ref_type: Optional[str] = None,
+    year: Optional[str] = None,
+    max_records_per_file: Optional[int] = None,
+) -> Tuple[int, Path, Optional[Path]]:
+    """Legacy API: export to CSV + TXT report (or no report if report_path=None)."""
+    return export_files_with_report(
+        inputs=inputs,
+        out_path=csv_path,
+        out_format="csv",
+        fieldnames=fieldnames,
+        delimiter=delimiter,
+        quoting=quoting,
+        include_header=include_header,
+        encoding=encoding,
+        ref_type=ref_type,
+        year=year,
+        max_records_per_file=max_records_per_file,
+        report_path=report_path,
+    )
+
+
+def export(xml_file: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
+    """Convenience: single XML file to CSV (+report unless disabled)."""
     return export_files_to_csv_with_report([xml_file], csv_path, **kwargs)

-
-
+
+def export_folder(folder: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Optional[Path]]:
+    """Convenience: all *.xml in folder to CSV (+report unless disabled)."""
     inputs = sorted(p for p in Path(folder).glob("*.xml") if p.is_file())
     if not inputs:
         raise FileNotFoundError(f"No *.xml found in {folder}")
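Taken together, `export_files_with_report()` replaces the CSV-only pipeline, while the old names remain as thin wrappers around it. A sketch of calling the new entry point directly for JSON output with DOI-based dedupe; the file names are hypothetical:

```python
from pathlib import Path

from endnote_utils.core import export_files_with_report

total, out_path, report_path = export_files_with_report(
    inputs=[Path("data/IEEE.xml"), Path("data/Scopus.xml")],
    out_path=Path("output/records.json"),
    out_format="json",
    dedupe="doi",          # keep the first occurrence of each DOI across both files
    dedupe_keep="first",
    stats=True,
    stats_json=Path("output/stats.json"),
    report_path=Path("output/records_report.txt"),
)
print(total, out_path, report_path)
```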
endnote_utils-0.2.0.dist-info/METADATA
ADDED
@@ -0,0 +1,223 @@
+Metadata-Version: 2.4
+Name: endnote-utils
+Version: 0.2.0
+Summary: Convert EndNote XML to CSV/JSON/XLSX with streaming parse and TXT report.
+Author-email: Minh Quach <minhquach8@gmail.com>
+License: MIT
+Keywords: endnote,xml,csv,bibliography,research
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: openpyxl>=3.1.0
+
+# EndNote Utils
+
+Convert **EndNote XML files** into clean CSV/JSON/XLSX with automatic TXT reports.
+Supports both **Python API** and **command-line interface (CLI)**.
+
+---
+
+## Features
+
+- ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
+- ✅ Streams `<record>` elements using `iterparse` (low memory usage)
+- ✅ Extracts fields:
+  `database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, keywords, publisher, isbn, language, extracted_date`
+- ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
+- ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
+- ✅ Supports **multiple output formats**: CSV, JSON, XLSX
+- ✅ Always generates a **TXT report** (default: `<out>_report.txt`) with:
+  - per-file counts (exported/skipped)
+  - totals, files processed
+  - run timestamp & duration
+  - **duplicate table** per database (Origin / Retractions / Duplicates / Remaining)
+  - optional duplicate key list (top-N)
+  - optional summary stats (year, ref_type, journal, top authors)
+- ✅ Auto-creates output folders if missing
+- ✅ Deduplication:
+  - `--dedupe doi` (unique by DOI)
+  - `--dedupe title-year` (unique by normalized title + year)
+  - `--dedupe-keep first|last` (keep first or last occurrence within each file)
+- ✅ Summary stats (`--stats`) with optional JSON export (`--stats-json`)
+- ✅ CLI options for CSV formatting, filters, verbosity
+- ✅ Importable Python API for scripting & integration
+
+---
+
+## Installation
+
+### From PyPI
+
+```bash
+pip install endnote-utils
+```
+
+Requires **Python 3.8+**.
+
+---
+
+## Usage
+
+### Command Line
+
+#### Single file
+
+```bash
+endnote-utils --xml data/IEEE.xml --out output/ieee.csv
+```
+
+#### Folder with multiple files
+
+```bash
+endnote-utils --folder data/xmls --out output/all_records.csv
+```
+
+#### Custom report path
+
+```bash
+endnote-utils \
+  --xml data/Scopus.xml \
+  --out output/scopus.csv \
+  --report reports/scopus_run.txt \
+  --stats \
+  --verbose
+```
+
+If `--report` is not provided, it defaults to `<out>_report.txt`.
+Use `--no-report` to disable report generation.
+
+---
+
+### CLI Options
+
+| Option | Description | Default |
+| --------------- | --------------------------------------------------- | ------------------ |
+| `--xml` | Path to a single EndNote XML file | – |
+| `--folder` | Path to a folder containing multiple `*.xml` files | – |
+| `--csv` | (Legacy) Output CSV path | – |
+| `--out` | Generic output path (`.csv`, `.json`, `.xlsx`) | – |
+| `--format` | Explicit format (`csv`, `json`, `xlsx`) | inferred |
+| `--report` | Output TXT report path | `<out>_report.txt` |
+| `--no-report` | Disable TXT report completely | – |
+| `--delimiter` | CSV delimiter | `,` |
+| `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
+| `--no-header` | Suppress CSV header row | – |
+| `--encoding` | Output text encoding | `utf-8` |
+| `--ref-type` | Only include records with this `ref_type` name | – |
+| `--year` | Only include records with this year | – |
+| `--max-records` | Stop after N records per file (for testing) | – |
+| `--dedupe` | Deduplicate mode: `none`, `doi`, `title-year` | `none` |
+| `--dedupe-keep` | Deduplication strategy: `first`, `last` | `first` |
+| `--stats` | Include summary stats in TXT report | – |
+| `--stats-json` | Path to JSON file to save stats & duplicate info | – |
+| `--verbose` | Verbose logging with debug details | – |
+
+---
+
+### Example Report (snippet)
+
+```
+========================================
+EndNote Export Report
+========================================
+Run started : 2025-09-11 14:30:22
+Files : 4
+Duration : 0.47 seconds
+
+Per-file results
+----------------------------------------
+GGScholar.xml   : 13 exported, 0 skipped
+IEEE.xml        : 2147 exported, 0 skipped
+PubMed.xml      : 504 exported, 0 skipped
+Scopus.xml      : 847 exported, 0 skipped
+TOTAL exported: 3511
+
+Duplicates table (by database)
+----------------------------------------
+Database    Origin  Retractions  Duplicates  Remaining
+------------------------------------------------------
+GGScholar      179            0          27        152
+IEEE          1900            0         589       1311
+PubMed         320            0         225         95
+Scopus        1999            1         511       1489
+TOTAL         4410            1        1352       3047
+
+Duplicate keys (top)
+----------------------------------------
+Mode : doi
+Keep : first
+Removed: 1352
+Details (top):
+  10.1109/SPMB55497.2022.10014965 : 3 duplicate(s)
+  10.1109/TSSA63730.2024.10864368 : 2 duplicate(s)
+
+Summary stats
+----------------------------------------
+By year:
+  2022 : 569
+  2023 : 684
+  2024 : 1148
+  2025 : 1108
+
+By ref_type (top):
+  Journal Article: 2037
+  Conference Proceedings: 1470
+  Book Section: 4
+
+By journal (top 20):
+  IEEE Access: 175
+  IEEE Journal of Biomedical and Health Informatics: 67
+  ...
+
+Top authors (top 10):
+  Y. Wang: 50
+  X. Wang: 35
+  ...
+```
+
+---
+
+## Python API
+
+```python
+from pathlib import Path
+from endnote_utils import export, export_folder
+
+# Single file
+total, out_file, report_file = export(
+    Path("data/IEEE.xml"),
+    Path("output/ieee.csv"),
+    dedupe="doi", stats=True
+)
+
+# Folder
+total, out_file, report_file = export_folder(
+    Path("data/xmls"),
+    Path("output/all.csv"),
+    ref_type="Conference Proceedings",
+    year="2024",
+    dedupe="title-year",
+    dedupe_keep="last",
+    stats=True,
+    stats_json=Path("output/stats.json"),
+)
+```
+
+---
+
+## Development Notes
+
+* Pure Python, uses only standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`, `json`).
+* Optional dependency: `openpyxl` (for Excel `.xlsx` export).
+* Streaming XML parsing avoids high memory usage.
+* Deduplication strategies configurable (`doi` / `title-year`).
+* Report includes per-database table and optional JSON snapshot.
+* Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
+
+---
+
+## License
+
+MIT License © 2025 Minh Quach
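The README above mentions `--stats-json` but does not show its shape. The keys below follow the `json.dump()` call added in `core.py`; the path is only an example:

```python
import json

with open("output/stats.json", encoding="utf-8") as f:
    stats = json.load(f)

print(stats["totals"]["exported"])       # total records written
print(sorted(stats["by_year"].items()))  # e.g. [('2022', 569), ('2023', 684), ...]
print(stats["duplicates"]["removed"])    # rows dropped by --dedupe
for author, count in stats["top_authors"]:
    print(f"{author}: {count}")
```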
endnote_utils-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+endnote_utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
+endnote_utils/cli.py,sha256=QFE73sKPMEbRiOuCVpMMQXT3RBx854uU-GS-ZHQv1Kw,7025
+endnote_utils/core.py,sha256=e52ebYHx2QdY3juS3Jt8-SQhJyDLvIycaj0WhIatang,22960
+endnote_utils-0.2.0.dist-info/METADATA,sha256=wllJhkRJlwO1eUROFNqvunl-rdSNiaSKzrTVH4p8zVs,7252
+endnote_utils-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+endnote_utils-0.2.0.dist-info/entry_points.txt,sha256=l8OEYTGiRj49CND6Xmpk4cIlAE8WJg6UInRo-YRvg8w,57
+endnote_utils-0.2.0.dist-info/top_level.txt,sha256=6ZlEkqvnKvYAHI7P3wlh5j3vDQF4-bKLIdYCwPTL-G8,14
+endnote_utils-0.2.0.dist-info/RECORD,,
endnote_utils-0.1.4.dist-info/METADATA
DELETED
@@ -1,145 +0,0 @@
-Metadata-Version: 2.4
-Name: endnote-utils
-Version: 0.1.4
-Summary: Convert EndNote XML to CSV with streaming parse and TXT report.
-Author-email: Minh Quach <minhquach8@gmail.com>
-License: MIT
-Keywords: endnote,xml,csv,bibliography,research
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-
-# EndNote Utils
-
-Convert **EndNote XML files** into clean CSVs with automatic TXT reports.
-Supports both **Python API** and **command-line interface (CLI)**.
-
----
-
-## Features
-
-- ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
-- ✅ Streams `<record>` elements using `iterparse` (low memory usage)
-- ✅ Extracts fields:
-  `database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, extracted_date`
-- ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
-- ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
-- ✅ Always generates a **TXT report** (default: `<csv>_report.txt`) with:
-  - per-file counts (exported/skipped records)
-  - totals, files processed
-  - run timestamp & duration
-- ✅ Auto-creates output folders if missing
-- ✅ CLI options for CSV formatting, filters, verbosity
-- ✅ Importable Python API for scripting & integration
-
----
-
-## Installation
-
-### From PyPI
-
-```bash
-pip install endnote-utils
-```
-
-Requires **Python 3.8+**.
-
----
-
-## Usage
-
-### Command Line
-
-#### Single file
-
-```bash
-endnote-utils --xml data/IEEE.xml --csv output/ieee.csv
-```
-
-#### Folder with multiple files
-
-```bash
-endnote-utils --folder data/xmls --csv output/all_records.csv
-```
-
-#### Custom report path
-
-```bash
-endnote-utils \
-  --xml data/Scopus.xml \
-  --csv output/scopus.csv \
-  --report reports/scopus_run.txt
-```
-
-If `--report` is not provided, it defaults to `<csv>_report.txt`.
-
----
-
-### CLI Options
-
-| Option | Description | Default |
-| --------------- | --------------------------------------------------- | ------------------ |
-| `--xml` | Path to a single EndNote XML file | – |
-| `--folder` | Path to a folder containing multiple `*.xml` files | – |
-| `--csv` | Output CSV path | – |
-| `--report` | Output TXT report path | `<csv>_report.txt` |
-| `--delimiter` | CSV delimiter | `,` |
-| `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
-| `--no-header` | Suppress CSV header row | – |
-| `--encoding` | Output CSV encoding | `utf-8` |
-| `--ref-type` | Only include records with this `ref_type` name | – |
-| `--year` | Only include records with this year | – |
-| `--max-records` | Stop after N records per file (useful for testing) | – |
-| `--verbose` | Verbose logging with debug details | – |
-
----
-
-### Example Report
-
-```
-Run started: 2025-09-11 14:30:22
-IEEE.xml: 120 exported, 0 skipped
-Scopus.xml: 95 exported, 2 skipped
-TOTAL exported: 215
-Files processed: 2
-Duration: 3.14 seconds
-```
-
----
-
-## Python API
-
-You can also use it directly in Python scripts:
-
-```python
-from pathlib import Path
-from endnote_utils import export, export_folder
-
-# Single file
-total, csv_out, report_out = export(
-    Path("data/IEEE.xml"), Path("output/ieee.csv")
-)
-
-# Folder
-total, csv_out, report_out = export_folder(
-    Path("data/xmls"), Path("output/all.csv"),
-    ref_type="Conference Proceedings", year="2024"
-)
-```
-
----
-
-## Development Notes
-
-* Pure Python, uses only standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`).
-* Streaming XML parsing avoids high memory usage.
-* Robust error handling: skips malformed records but logs them in verbose mode.
-* Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
-
----
-
-## License
-
-MIT License © 2025 Minh Quach
endnote_utils-0.1.4.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-endnote_utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
-endnote_utils/cli.py,sha256=TQxdO7IlaRXwNTm0MpBVk9CeUTUGgtlcI0O3O9xhgdM,2160
-endnote_utils/core.py,sha256=cddpuRMF5RC5mp3Lll0eTA9MXLzcVDnDl1Z7IMHOr0k,7480
-endnote_utils-0.1.4.dist-info/METADATA,sha256=FXD6AXEFT1_lqYpuDYtX89XnNsQpZrftudMp-YzodQI,4316
-endnote_utils-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-endnote_utils-0.1.4.dist-info/entry_points.txt,sha256=l8OEYTGiRj49CND6Xmpk4cIlAE8WJg6UInRo-YRvg8w,57
-endnote_utils-0.1.4.dist-info/top_level.txt,sha256=6ZlEkqvnKvYAHI7P3wlh5j3vDQF4-bKLIdYCwPTL-G8,14
-endnote_utils-0.1.4.dist-info/RECORD,,
{endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/WHEEL
File without changes
{endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/entry_points.txt
File without changes
{endnote_utils-0.1.4.dist-info → endnote_utils-0.2.0.dist-info}/top_level.txt
File without changes