endnote-utils 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- endnote-utils/__init__.py +14 -0
- endnote-utils/cli.py +54 -0
- endnote-utils/core.py +209 -0
- endnote_utils-0.1.0.dist-info/METADATA +154 -0
- endnote_utils-0.1.0.dist-info/RECORD +8 -0
- endnote_utils-0.1.0.dist-info/WHEEL +5 -0
- endnote_utils-0.1.0.dist-info/entry_points.txt +2 -0
- endnote_utils-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
"""Public API surface for the endnote-utils package."""

from .core import (
    CSV_QUOTING_MAP,
    DEFAULT_FIELDNAMES,
    export,
    export_files_to_csv_with_report,
    export_folder,
)

# Names re-exported as the package's public interface.
__all__ = [
    "export",
    "export_folder",
    "export_files_to_csv_with_report",
    "DEFAULT_FIELDNAMES",
    "CSV_QUOTING_MAP",
]

__version__ = "0.1.0"
|
endnote-utils/cli.py
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import argparse
|
4
|
+
import logging
|
5
|
+
import sys
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
from .core import DEFAULT_FIELDNAMES, export, export_folder
|
9
|
+
|
10
|
+
|
11
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the EndNote XML export CLI."""
    parser = argparse.ArgumentParser(
        description="Export EndNote XML (file or folder) to CSV + TXT report."
    )

    # Exactly one input source must be supplied.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--xml", help="Path to a single EndNote XML file.")
    source.add_argument("--folder", help="Path to a folder containing *.xml files.")

    # Output destinations.
    parser.add_argument("--csv", required=True, help="Path to CSV output file.")
    parser.add_argument(
        "--report",
        required=False,
        help="Path to TXT report (default: <csv>_report.txt).",
    )

    # CSV formatting controls.
    parser.add_argument("--delimiter", default=",")
    parser.add_argument(
        "--quoting",
        default="minimal",
        choices=["minimal", "all", "nonnumeric", "none"],
    )
    parser.add_argument("--no-header", action="store_true")
    parser.add_argument("--encoding", default="utf-8")

    # Record filters and limits.
    parser.add_argument("--ref-type", default=None)
    parser.add_argument("--year", default=None)
    parser.add_argument("--max-records", type=int, default=None)

    parser.add_argument("--verbose", action="store_true")
    return parser
|
27
|
+
|
28
|
+
def main() -> None:
    """CLI entry point: parse arguments, run the export, log the results."""
    args = build_parser().parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
        stream=sys.stderr,
    )

    csv_path = Path(args.csv)
    if args.report:
        report_path = Path(args.report)
    else:
        # Default report sits next to the CSV as <stem>_report.txt.
        report_path = csv_path.with_name(csv_path.stem + "_report.txt")

    # Options shared by both the single-file and folder code paths.
    common = dict(
        report_path=report_path,
        fieldnames=DEFAULT_FIELDNAMES,
        delimiter=args.delimiter,
        quoting=args.quoting,
        include_header=not args.no_header,
        encoding=args.encoding,
        ref_type=args.ref_type,
        year=args.year,
        max_records_per_file=args.max_records,
    )

    if args.xml:
        total, csv_out, rep_out = export(Path(args.xml), csv_path, **common)
    else:
        total, csv_out, rep_out = export_folder(Path(args.folder), csv_path, **common)

    logging.info("Exported %d record(s) → %s", total, csv_out)
    logging.info("Report → %s", rep_out)
|
endnote-utils/core.py
ADDED
@@ -0,0 +1,209 @@
|
|
1
|
+
# src/endnote_exporter/core.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import csv
|
5
|
+
import logging
|
6
|
+
import time
|
7
|
+
import xml.etree.ElementTree as ET
|
8
|
+
from datetime import datetime
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
11
|
+
|
12
|
+
# Column order for the exported CSV; also the default `fieldnames` argument
# of export_files_to_csv_with_report.
DEFAULT_FIELDNAMES: List[str] = [
    "database", "ref_type", "title", "journal", "authors", "year",
    "volume", "number", "abstract", "doi", "urls", "extracted_date",
]

# Maps the CLI-friendly --quoting values to the csv module's quoting constants.
CSV_QUOTING_MAP = {
    "minimal": csv.QUOTE_MINIMAL,
    "all": csv.QUOTE_ALL,
    "nonnumeric": csv.QUOTE_NONNUMERIC,
    "none": csv.QUOTE_NONE,
}
|
23
|
+
|
24
|
+
def ensure_parent_dir(p: Path) -> None:
    # Create the parent directory chain of *p* if it does not exist yet.
    # NOTE(review): this function is redefined further down in this module;
    # at import time the later definition is the one that takes effect.
    p.parent.mkdir(parents=True, exist_ok=True)
|
26
|
+
|
27
|
+
# ----------------------------
|
28
|
+
# Utilities
|
29
|
+
# ----------------------------
|
30
|
+
|
31
|
+
def clean_text(text: Optional[str]) -> str:
    """Normalize whitespace in *text* for safe CSV output.

    Strips leading/trailing whitespace, collapses internal whitespace runs
    to single spaces, and drops stray carriage returns, while leaving
    punctuation intact. Falsy input (None or "") yields an empty string.
    """
    if not text:
        return ""
    without_cr = text.replace("\r", " ")
    return " ".join(without_cr.split()).strip()
|
40
|
+
|
41
|
+
|
42
|
+
def safe_find_text(node: ET.Element, path: str) -> str:
    """Look up *path* under *node* and return its cleaned text ('' if absent)."""
    found = node.find(path)
    if found is None or found.text is None:
        return ""
    return clean_text(found.text)
|
46
|
+
|
47
|
+
|
48
|
+
def join_nonempty(items: Iterable[str], sep: str) -> str:
    """Join the stripped, non-empty entries of *items* with *sep*."""
    stripped = (item.strip() for item in items)
    return sep.join(part for part in stripped if part)
|
50
|
+
|
51
|
+
def ensure_parent_dir(p: Path) -> None:
    """Create the parent directory of *p* if it doesn't exist.

    ``mkdir(exist_ok=True)`` already tolerates an existing directory, so the
    previous ``exists()`` pre-check was redundant and racy (TOCTOU) under
    concurrent runs; the unconditional call is both simpler and safe.
    """
    p.parent.mkdir(parents=True, exist_ok=True)
|
55
|
+
|
56
|
+
|
57
|
+
# ----------------------------
|
58
|
+
# Record processing
|
59
|
+
# ----------------------------
|
60
|
+
|
61
|
+
def process_doi(record: ET.Element) -> str:
    """Return a canonical DOI URL for *record*, or '' when unavailable.

    Bare DOIs ("10.xxxx/...") are prefixed with https://doi.org/; values
    that are already http(s) URLs pass through unchanged; anything else
    is discarded.
    """
    raw = safe_find_text(record, ".//electronic-resource-num/style")
    if not raw:
        return ""
    if raw.startswith("10."):
        return f"https://doi.org/{raw}"
    return raw if raw.startswith(("http://", "https://")) else ""
|
71
|
+
|
72
|
+
|
73
|
+
def extract_authors(record: ET.Element) -> str:
    """Return all author names found under //author/style, joined by '; '."""
    styles = (author.find("style") for author in record.findall(".//author"))
    names = [clean_text(s.text) for s in styles if s is not None and s.text]
    return join_nonempty(names, "; ")
|
81
|
+
|
82
|
+
|
83
|
+
def extract_urls(record: ET.Element) -> str:
    """Return the record's related URLs, de-duplicated, joined by ' | '."""
    styles = (u.find("style") for u in record.findall(".//urls/related-urls/url"))
    cleaned = [clean_text(s.text) for s in styles if s is not None and s.text]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique = list(dict.fromkeys(cleaned))
    return join_nonempty(unique, " | ")
|
98
|
+
|
99
|
+
|
100
|
+
def process_record(record: ET.Element, database: str) -> Dict[str, str]:
    """Flatten one <record> element into a CSV-ready row dict.

    *database* (the source file's stem) is carried through unchanged; all
    other values are extracted from the record's XML subtree.
    """
    # The human-readable reference type lives in the element's "name" attribute.
    ref_type_elem = record.find("ref-type")
    if ref_type_elem is not None:
        ref_type_name = ref_type_elem.get("name") or ""
    else:
        ref_type_name = ""

    return {
        "database": database,
        "ref_type": clean_text(ref_type_name),
        "title": safe_find_text(record, ".//title/style"),
        "journal": safe_find_text(record, ".//secondary-title/style"),
        "authors": extract_authors(record),
        "year": safe_find_text(record, ".//year/style"),
        "volume": safe_find_text(record, ".//volume/style"),
        "number": safe_find_text(record, ".//number/style"),
        "abstract": safe_find_text(record, ".//abstract/style"),
        "doi": process_doi(record),
        "urls": extract_urls(record),
        "extracted_date": datetime.now().strftime("%Y-%m-%d"),
    }
|
121
|
+
|
122
|
+
def iter_records(xml_path: Path) -> Iterable[ET.Element]:
    """Stream <record> elements from *xml_path* with bounded memory.

    Uses incremental parsing and clears each yielded element (and the
    root's accumulated children) once the consumer has processed it.
    """
    context = ET.iterparse(str(xml_path), events=("start", "end"))
    _, root = next(context)  # first event delivers the document root
    for event, elem in context:
        if event != "end" or elem.tag != "record":
            continue
        yield elem
        # Reclaim memory: drop the record subtree and the root's children.
        elem.clear()
        root.clear()
|
130
|
+
|
131
|
+
def record_matches_filters(row: Dict[str, str], ref_type: Optional[str], year: Optional[str]) -> bool:
    """Return True when *row* passes the optional ref_type/year filters.

    A falsy filter value means "no constraint"; *year* is compared as a
    string so callers may pass ints.
    """
    ref_type_ok = not ref_type or row.get("ref_type") == ref_type
    year_ok = not year or row.get("year") == str(year)
    return ref_type_ok and year_ok
|
135
|
+
|
136
|
+
def export_files_to_csv_with_report(
    inputs: List[Path],
    csv_path: Path,
    report_path: Optional[Path] = None,
    *,
    fieldnames: Optional[List[str]] = None,  # was mis-annotated List[str] = None
    delimiter: str = ",",
    quoting: str = "minimal",
    include_header: bool = True,
    encoding: str = "utf-8",
    ref_type: Optional[str] = None,
    year: Optional[str] = None,
    max_records_per_file: Optional[int] = None,
) -> Tuple[int, Path, Path]:
    """Primary library API: export one or many XML files to a single CSV + TXT report.

    Args:
        inputs: XML files to process; each file's stem becomes the row's
            ``database`` value.
        csv_path: Destination CSV file (parent directories are created).
        report_path: Destination TXT report; defaults to
            ``<csv stem>_report.txt`` beside the CSV.
        fieldnames: CSV column order; defaults to ``DEFAULT_FIELDNAMES``.
        delimiter: CSV delimiter character.
        quoting: One of the keys of ``CSV_QUOTING_MAP``.
        include_header: Write a header row when True.
        encoding: Encoding used for the CSV file.
        ref_type: Keep only rows whose ``ref_type`` equals this value.
        year: Keep only rows whose ``year`` equals this value.
        max_records_per_file: Stop after this many exported rows per file.

    Returns:
        ``(total_rows_written, csv_path, report_path)``.

    Raises:
        KeyError: If *quoting* is not a recognized mode.
    """
    fieldnames = fieldnames or DEFAULT_FIELDNAMES
    qmode = CSV_QUOTING_MAP[quoting]
    report_path = report_path or csv_path.with_name(csv_path.stem + "_report.txt")

    ensure_parent_dir(csv_path)
    ensure_parent_dir(report_path)

    total_written = 0
    per_file_lines: List[str] = []  # one summary line per input file
    start_ts = time.time()
    run_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    with open(csv_path, "w", newline="", encoding=encoding) as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=delimiter, quoting=qmode)
        if include_header:
            writer.writeheader()

        for xml_path in inputs:
            database = xml_path.stem
            logging.info("Processing %s (database=%s)", xml_path.name, database)
            file_written = file_skipped = 0

            for rec in iter_records(xml_path):
                try:
                    row = process_record(rec, database=database)
                    if record_matches_filters(row, ref_type, year):
                        # Project onto the requested columns; missing keys -> "".
                        writer.writerow({k: row.get(k, "") for k in fieldnames})
                        file_written += 1
                        total_written += 1
                        if max_records_per_file and file_written >= max_records_per_file:
                            break
                except Exception:
                    # Malformed records are skipped, not fatal; details at DEBUG.
                    file_skipped += 1
                    logging.debug("Record error in %s", xml_path, exc_info=True)

            per_file_lines.append(f"{xml_path.name}: {file_written} exported, {file_skipped} skipped")

    dur = time.time() - start_ts
    report_lines = [
        f"Run started: {run_start}",
        *per_file_lines,
        f"TOTAL exported: {total_written}",
        f"Files processed: {len(inputs)}",
        f"Duration: {dur:.2f} seconds",
    ]
    with open(report_path, "w", encoding="utf-8") as rf:
        rf.write("\n".join(report_lines))

    return total_written, csv_path, report_path
|
199
|
+
|
200
|
+
def export(xml_file: Path, csv_path: Path, **kwargs) -> Tuple[int, Path, Path]:
    """Convenience: single XML file to CSV (+report).

    Thin wrapper around ``export_files_to_csv_with_report`` with a
    one-element input list; accepts the same keyword arguments and returns
    ``(total_written, csv_path, report_path)``.
    """
    return export_files_to_csv_with_report([xml_file], csv_path, **kwargs)
|
203
|
+
|
204
|
+
def export_folder(folder: Path, csv_path: Path, **kwargs):
    """Convenience: export every *.xml file in *folder* to one CSV (+report).

    Files are processed in sorted order. Raises FileNotFoundError when the
    folder contains no XML files.
    """
    xml_files = sorted(
        candidate for candidate in Path(folder).glob("*.xml") if candidate.is_file()
    )
    if not xml_files:
        raise FileNotFoundError(f"No *.xml found in {folder}")
    return export_files_to_csv_with_report(xml_files, csv_path, **kwargs)
|
@@ -0,0 +1,154 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: endnote-utils
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Convert EndNote XML to CSV with streaming parse and TXT report.
|
5
|
+
Author-email: Minh Quach <minhquach8@gmail.com>
|
6
|
+
License: MIT
|
7
|
+
Keywords: endnote,xml,csv,bibliography,research
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Requires-Python: >=3.8
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
|
14
|
+
# EndNote Exporter
|
15
|
+
|
16
|
+
Convert **EndNote XML files** into clean CSVs with automatic TXT reports.
|
17
|
+
Supports both **Python API** and **command-line interface (CLI)**.
|
18
|
+
|
19
|
+
---
|
20
|
+
|
21
|
+
## Features
|
22
|
+
|
23
|
+
- ✅ Parse one XML file (`--xml`) or an entire folder of `*.xml` (`--folder`)
|
24
|
+
- ✅ Streams `<record>` elements using `iterparse` (low memory usage)
|
25
|
+
- ✅ Extracts fields:
|
26
|
+
`database, ref_type, title, journal, authors, year, volume, number, abstract, doi, urls, extracted_date`
|
27
|
+
- ✅ Adds a `database` column from the XML filename stem (`IEEE.xml → IEEE`)
|
28
|
+
- ✅ Normalizes DOI (`10.xxxx` → `https://doi.org/...`)
|
29
|
+
- ✅ Always generates a **TXT report** (default: `<csv>_report.txt`) with:
|
30
|
+
- per-file counts (exported/skipped records)
|
31
|
+
- totals, files processed
|
32
|
+
- run timestamp & duration
|
33
|
+
- ✅ Auto-creates output folders if missing
|
34
|
+
- ✅ CLI options for CSV formatting, filters, verbosity
|
35
|
+
- ✅ Importable Python API for scripting & integration
|
36
|
+
|
37
|
+
---
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
|
41
|
+
### From PyPI (recommended)
|
42
|
+
|
43
|
+
```bash
|
44
|
+
pip install endnote-utils
|
45
|
+
```
|
46
|
+
|
47
|
+
### From local source
|
48
|
+
|
49
|
+
If you have the source code in a folder or `.zip`:
|
50
|
+
|
51
|
+
```bash
|
52
|
+
cd /path/to/endnote-utils
|
53
|
+
pip install .
|
54
|
+
```
|
55
|
+
|
56
|
+
Requires **Python 3.8+**.
|
57
|
+
|
58
|
+
---
|
59
|
+
|
60
|
+
## Usage
|
61
|
+
|
62
|
+
### Command Line
|
63
|
+
|
64
|
+
#### Single file
|
65
|
+
|
66
|
+
```bash
|
67
|
+
endnote-xml-export --xml data/IEEE.xml --csv output/ieee.csv
|
68
|
+
```
|
69
|
+
|
70
|
+
#### Folder with multiple files
|
71
|
+
|
72
|
+
```bash
|
73
|
+
endnote-xml-export --folder data/xmls --csv output/all_records.csv
|
74
|
+
```
|
75
|
+
|
76
|
+
#### Custom report path
|
77
|
+
|
78
|
+
```bash
|
79
|
+
endnote-xml-export \
|
80
|
+
--xml data/Scopus.xml \
|
81
|
+
--csv output/scopus.csv \
|
82
|
+
--report reports/scopus_run.txt
|
83
|
+
```
|
84
|
+
|
85
|
+
If `--report` is not provided, it defaults to `<csv>_report.txt`.
|
86
|
+
|
87
|
+
---
|
88
|
+
|
89
|
+
### CLI Options
|
90
|
+
|
91
|
+
| Option | Description | Default |
|
92
|
+
| --------------- | --------------------------------------------------- | ------------------ |
|
93
|
+
| `--xml` | Path to a single EndNote XML file | – |
|
94
|
+
| `--folder` | Path to a folder containing multiple `*.xml` files | – |
|
95
|
+
| `--csv` | Output CSV path | – |
|
96
|
+
| `--report` | Output TXT report path | `<csv>_report.txt` |
|
97
|
+
| `--delimiter` | CSV delimiter | `,` |
|
98
|
+
| `--quoting` | CSV quoting: `minimal`, `all`, `nonnumeric`, `none` | `minimal` |
|
99
|
+
| `--no-header` | Suppress CSV header row | – |
|
100
|
+
| `--encoding` | Output CSV encoding | `utf-8` |
|
101
|
+
| `--ref-type` | Only include records with this `ref_type` name | – |
|
102
|
+
| `--year` | Only include records with this year | – |
|
103
|
+
| `--max-records` | Stop after N records per file (useful for testing) | – |
|
104
|
+
| `--verbose` | Verbose logging with debug details | – |
|
105
|
+
|
106
|
+
---
|
107
|
+
|
108
|
+
### Example Report
|
109
|
+
|
110
|
+
```
|
111
|
+
Run started: 2025-09-11 14:30:22
|
112
|
+
IEEE.xml: 120 exported, 0 skipped
|
113
|
+
Scopus.xml: 95 exported, 2 skipped
|
114
|
+
TOTAL exported: 215
|
115
|
+
Files processed: 2
|
116
|
+
Duration: 3.14 seconds
|
117
|
+
```
|
118
|
+
|
119
|
+
---
|
120
|
+
|
121
|
+
## Python API
|
122
|
+
|
123
|
+
You can also use it directly in Python scripts:
|
124
|
+
|
125
|
+
```python
|
126
|
+
from pathlib import Path
|
127
|
+
from endnote_utils import export, export_folder
|
128
|
+
|
129
|
+
# Single file
|
130
|
+
total, csv_out, report_out = export(
|
131
|
+
Path("data/IEEE.xml"), Path("output/ieee.csv")
|
132
|
+
)
|
133
|
+
|
134
|
+
# Folder
|
135
|
+
total, csv_out, report_out = export_folder(
|
136
|
+
Path("data/xmls"), Path("output/all.csv"),
|
137
|
+
ref_type="Conference Proceedings", year="2024"
|
138
|
+
)
|
139
|
+
```
|
140
|
+
|
141
|
+
---
|
142
|
+
|
143
|
+
## Development Notes
|
144
|
+
|
145
|
+
* Pure Python, uses only standard library (`argparse`, `csv`, `xml.etree.ElementTree`, `logging`, `pathlib`).
|
146
|
+
* Streaming XML parsing avoids high memory usage.
|
147
|
+
* Robust error handling: skips malformed records but logs them in verbose mode.
|
148
|
+
* Follows [PEP 621](https://peps.python.org/pep-0621/) packaging (`pyproject.toml`).
|
149
|
+
|
150
|
+
---
|
151
|
+
|
152
|
+
## License
|
153
|
+
|
154
|
+
MIT License © 2025 Minh Quach
|
@@ -0,0 +1,8 @@
|
|
1
|
+
endnote-utils/__init__.py,sha256=yuPzjVRcBiQPUCFG5DbSvwqz1O42_hYJ3_7dzWotE20,284
|
2
|
+
endnote-utils/cli.py,sha256=TQxdO7IlaRXwNTm0MpBVk9CeUTUGgtlcI0O3O9xhgdM,2160
|
3
|
+
endnote-utils/core.py,sha256=cddpuRMF5RC5mp3Lll0eTA9MXLzcVDnDl1Z7IMHOr0k,7480
|
4
|
+
endnote_utils-0.1.0.dist-info/METADATA,sha256=oBeBNLi7xPptTfTk6FvjtLv576YK0FTGj4_zrWZhebw,4486
|
5
|
+
endnote_utils-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
6
|
+
endnote_utils-0.1.0.dist-info/entry_points.txt,sha256=1mpe6VMBMZwSGdNPdcDPMesi8QEovtEUtf-_slQF8g8,65
|
7
|
+
endnote_utils-0.1.0.dist-info/top_level.txt,sha256=Bs12r-xSbXfACyOO_DFAg4TmG8xY_3iM4cx5skHB3-g,14
|
8
|
+
endnote_utils-0.1.0.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
endnote-utils
|