philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- philoch_bib_sdk/__init__.py +0 -0
- philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
- philoch_bib_sdk/adapters/io/__init__.py +115 -0
- philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
- philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
- philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
- philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
- philoch_bib_sdk/logic/__init__.py +39 -0
- philoch_bib_sdk/logic/default_models.py +315 -0
- philoch_bib_sdk/logic/functions/__init__.py +31 -0
- philoch_bib_sdk/logic/functions/comparator.py +414 -0
- philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
- philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
- philoch_bib_sdk/logic/literals.py +98 -0
- philoch_bib_sdk/logic/models.py +366 -0
- philoch_bib_sdk/logic/models_staging.py +173 -0
- philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
- philoch_bib_sdk/py.typed +0 -0
- philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
- philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
- philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
- philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
- philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
- philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
- philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
- philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
- philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
philoch_bib_sdk/__init__.py
File without changes

philoch_bib_sdk/_rust.cp313-win_amd64.pyd
Binary file
philoch_bib_sdk/adapters/io/__init__.py
@@ -0,0 +1,115 @@
"""IO adapters with automatic format detection.

This module provides format-agnostic wrappers that detect file formats
and delegate to appropriate format-specific adapters.
"""

from pathlib import Path
from typing import Dict, Tuple

from aletk.ResultMonad import Err, Ok

from philoch_bib_sdk.adapters.io.csv import (
    load_bibliography_csv,
    load_staged_csv,
    write_report_csv,
)
from philoch_bib_sdk.adapters.io.ods import (
    load_bibliography_ods,
    load_staged_ods,
)
from philoch_bib_sdk.logic.models import BibItem
from philoch_bib_sdk.logic.models_staging import BibItemStaged


def load_bibliography(filename: str, max_rows: int | None = None) -> Ok[Dict[str, BibItem]] | Err:
    """Load bibliography with automatic format detection.

    Detects format based on file extension and delegates to appropriate adapter.

    Supported formats:
    - .csv: CSV format
    - .ods: OpenDocument Spreadsheet format

    Args:
        filename: Path to bibliography file
        max_rows: Optional limit on number of rows (for testing large files)

    Returns:
        Ok[Dict[str, BibItem]] with bibkey as key, or Err on failure
    """
    file_path = Path(filename)
    suffix = file_path.suffix.lower()

    match suffix:
        case ".csv":
            return load_bibliography_csv(filename)
        case ".ods":
            return load_bibliography_ods(filename, max_rows=max_rows)
        case _:
            return Err(
                message=f"Unsupported bibliography format: {suffix}. Supported: .csv, .ods",
                code=-1,
                error_type="UnsupportedFormatError",
            )


def load_staged(filename: str, max_rows: int | None = None) -> Ok[Tuple[BibItem, ...]] | Err:
    """Load staged items with automatic format detection.

    Detects format based on file extension and delegates to appropriate adapter.

    Supported formats:
    - .csv: CSV format
    - .ods: OpenDocument Spreadsheet format

    Args:
        filename: Path to staged items file
        max_rows: Optional limit on number of rows (for testing large files)

    Returns:
        Ok[Tuple[BibItem, ...]] or Err on failure
    """
    file_path = Path(filename)
    suffix = file_path.suffix.lower()

    match suffix:
        case ".csv":
            return load_staged_csv(filename)
        case ".ods":
            return load_staged_ods(filename, max_rows=max_rows)
        case _:
            return Err(
                message=f"Unsupported staged items format: {suffix}. Supported: .csv, .ods",
                code=-1,
                error_type="UnsupportedFormatError",
            )


def write_report(filename: str, staged: Tuple[BibItemStaged, ...], output_format: str = "csv") -> Ok[None] | Err:
    """Write fuzzy matching report with format selection.

    Args:
        filename: Path to output file (extension will be added based on format)
        staged: Tuple of staged items with matches
        output_format: Output format ("csv", etc.)

    Returns:
        Ok[None] on success, Err on failure
    """
    match output_format.lower():
        case "csv":
            return write_report_csv(filename, staged)
        case _:
            return Err(
                message=f"Unsupported output format: {output_format}. Supported: csv",
                code=-1,
                error_type="UnsupportedFormatError",
            )


__all__ = [
    "load_bibliography",
    "load_staged",
    "write_report",
]
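A minimal usage sketch of the wrappers above; the file names are hypothetical placeholders. Format selection is driven purely by the filename extension, as the match statements show; note that the CSV branch of load_bibliography does not forward max_rows.

    from aletk.ResultMonad import Err

    from philoch_bib_sdk.adapters.io import load_bibliography, load_staged

    result = load_bibliography("library.ods", max_rows=100)
    if isinstance(result, Err):
        print(f"load failed: {result.message}")
    else:
        bibliography = result.out  # Dict[str, BibItem], keyed by formatted bibkey
        print(f"loaded {len(bibliography)} items")

    # Dispatches to the CSV adapter based on the extension.
    staged = load_staged("incoming.csv")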
philoch_bib_sdk/adapters/io/csv/__init__.py
@@ -0,0 +1,308 @@
"""CSV adapters for reading and writing bibliographic data.

This module provides CSV-specific implementations for loading bibliographies,
staged items, and writing fuzzy matching reports.
"""

import csv
import traceback
from pathlib import Path
from typing import Any, Dict, Tuple

from aletk.ResultMonad import Err, Ok
from aletk.utils import get_logger

from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey
from philoch_bib_sdk.converters.plaintext.bibitem.parser import (
    ParsedBibItemData,
    parse_bibitem,
)
from philoch_bib_sdk.logic.models import BibItem
from philoch_bib_sdk.logic.models_staging import BibItemStaged

logger = get_logger(__name__)


def _csv_row_to_parsed_data(row: dict[str, Any]) -> ParsedBibItemData:
    """Convert CSV row to ParsedBibItemData, filtering empty values.

    Args:
        row: Dictionary from csv.DictReader

    Returns:
        ParsedBibItemData with empty values removed
    """
    # Filter out empty values and create ParsedBibItemData
    # TypedDict with total=False allows any subset of fields
    return {k: v for k, v in row.items() if v}  # type: ignore[return-value]


def load_bibliography_csv(filename: str) -> Ok[Dict[str, BibItem]] | Err:
    """Load bibliography from CSV file.

    Expected CSV format: Standard CSV with headers matching ParsedBibItemData fields.
    Required columns: entry_type, author, title, date
    Optional columns: journal, volume, number, pages, doi, etc.

    Args:
        filename: Path to CSV file

    Returns:
        Ok[Dict[str, BibItem]] with bibkey as key, or Err on failure
    """
    try:
        file_path = Path(filename)
        if not file_path.exists():
            return Err(
                message=f"File not found: {filename}",
                code=-1,
                error_type="FileNotFoundError",
            )

        bibliography: Dict[str, BibItem] = {}
        errors = []

        with open(file_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)

            if reader.fieldnames is None:
                return Err(
                    message=f"CSV file has no headers: {filename}",
                    code=-1,
                    error_type="CSVFormatError",
                )

            for row_num, row in enumerate(reader, start=2):  # Start at 2 (header is row 1)
                # Convert CSV row to ParsedBibItemData
                parsed_data = _csv_row_to_parsed_data(row)

                # Parse the row into a BibItem
                parse_result = parse_bibitem(parsed_data, bibstring_type="simplified")

                if isinstance(parse_result, Err):
                    errors.append(f"Row {row_num}: {parse_result.message}")
                    continue

                bibitem = parse_result.out
                bibkey = format_bibkey(bibitem.bibkey)

                # Check for duplicate bibkeys
                if bibkey in bibliography:
                    errors.append(f"Row {row_num}: Duplicate bibkey '{bibkey}' (first seen in earlier row)")
                    continue

                bibliography[bibkey] = bibitem

        # Report results
        if errors:
            error_summary = f"Loaded {len(bibliography)} items with {len(errors)} errors:\n" + "\n".join(
                errors[:10]  # Show first 10 errors
            )
            if len(errors) > 10:
                error_summary += f"\n... and {len(errors) - 10} more errors"

            logger.warning(error_summary)

        if not bibliography:
            return Err(
                message=f"No valid items loaded from {filename}. Errors: {len(errors)}",
                code=-1,
                error_type="EmptyBibliographyError",
            )

        logger.info(f"Successfully loaded {len(bibliography)} items from {filename}")
        return Ok(bibliography)

    except Exception as e:
        return Err(
            message=f"Failed to load bibliography from {filename}: {e.__class__.__name__}: {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )


def load_staged_csv_allow_empty_bibkeys(filename: str) -> Ok[Tuple[BibItem, ...]] | Err:
    """Load staged items from CSV file, allowing empty bibkeys.

    This is useful for staging files where bibkeys haven't been assigned yet.
    Items without bibkeys will be assigned temporary sequential keys.

    Args:
        filename: Path to CSV file

    Returns:
        Ok[Tuple[BibItem, ...]] or Err on failure
    """
    try:
        file_path = Path(filename)

        if not file_path.exists():
            return Err(
                message=f"File not found: {filename}",
                code=-1,
                error_type="FileNotFoundError",
            )

        staged_items: list[BibItem] = []
        errors = []

        with open(file_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)

            if reader.fieldnames is None:
                return Err(
                    message=f"CSV file has no headers: {filename}",
                    code=-1,
                    error_type="CSVFormatError",
                )

            for row_num, row in enumerate(reader, start=2):  # Start at 2 (header is row 1)
                # Convert CSV row to ParsedBibItemData
                parsed_data = _csv_row_to_parsed_data(row)

                # If bibkey is empty, assign a temporary one
                if not parsed_data.get("bibkey"):
                    parsed_data["bibkey"] = f"temp:{row_num}"

                # Parse the row into a BibItem
                parse_result = parse_bibitem(parsed_data, bibstring_type="simplified")

                if isinstance(parse_result, Err):
                    errors.append(f"Row {row_num}: {parse_result.message}")
                    continue

                bibitem = parse_result.out
                staged_items.append(bibitem)

        # Report results
        if errors:
            error_summary = f"Loaded {len(staged_items)} items with {len(errors)} errors:\n" + "\n".join(
                errors[:10]  # Show first 10 errors
            )
            if len(errors) > 10:
                error_summary += f"\n... and {len(errors) - 10} more errors"

            logger.warning(error_summary)

        if not staged_items:
            return Err(
                message=f"No valid items loaded from {filename}. Errors: {len(errors)}",
                code=-1,
                error_type="EmptyFileError",
            )

        logger.info(f"Successfully loaded {len(staged_items)} staged items from {filename}")

        return Ok(tuple(staged_items))

    except Exception as e:
        return Err(
            message=f"Failed to load staged items from {filename}: {e.__class__.__name__}: {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )


def load_staged_csv(filename: str) -> Ok[Tuple[BibItem, ...]] | Err:
    """Load staged items from CSV file.

    Uses the same format as load_bibliography_csv - standard CSV with ParsedBibItemData fields.
    Additional score-related columns (if present) are ignored during loading.

    Args:
        filename: Path to CSV file

    Returns:
        Ok[Tuple[BibItem, ...]] or Err on failure
    """
    try:
        # Load as bibliography first
        result = load_bibliography_csv(filename)

        if isinstance(result, Err):
            return result

        bibliography = result.out

        # Convert dict values to tuple
        staged_items = tuple(bibliography.values())

        logger.info(f"Successfully loaded {len(staged_items)} staged items from {filename}")
        return Ok(staged_items)

    except Exception as e:
        return Err(
            message=f"Failed to load staged items from {filename}: {e.__class__.__name__}: {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )


def write_report_csv(filename: str, staged: Tuple[BibItemStaged, ...]) -> Ok[None] | Err:
    """Write fuzzy matching report to CSV file.

    Output format: Uses BibItemStaged.to_csv_row() with columns:
    - staged_bibkey, staged_title, staged_author, staged_year
    - num_matches, best_match_score, best_match_bibkey
    - top_matches_json (JSON-encoded match details)
    - search_time_ms, candidates_searched

    Args:
        filename: Path to output CSV file (without extension)
        staged: Tuple of staged items with matches

    Returns:
        Ok[None] on success, Err on failure
    """
    try:
        # Add .csv extension if not present
        output_path = Path(filename)
        if output_path.suffix != ".csv":
            output_path = output_path.with_suffix(".csv")

        if not staged:
            logger.warning("No staged items to write")
            # Create empty file with headers
            with open(output_path, "w", encoding="utf-8", newline="") as f:
                simple_writer = csv.writer(f)
                simple_writer.writerow(
                    [
                        "staged_bibkey",
                        "staged_title",
                        "staged_author",
                        "staged_year",
                        "num_matches",
                        "best_match_score",
                        "best_match_bibkey",
                        "top_matches_json",
                        "search_time_ms",
                        "candidates_searched",
                    ]
                )
            logger.info(f"Created empty report at {output_path}")
            return Ok(None)

        # Convert to CSV rows
        rows = tuple(item.to_csv_row() for item in staged)

        # Write to CSV
        with open(output_path, "w", encoding="utf-8", newline="") as f:
            if rows:
                fieldnames = list(rows[0].keys())
                dict_writer = csv.DictWriter(f, fieldnames=fieldnames)
                dict_writer.writeheader()
                dict_writer.writerows(rows)

        logger.info(f"Successfully wrote {len(staged)} items to {output_path}")
        return Ok(None)

    except Exception as e:
        return Err(
            message=f"Failed to write report to {filename}: {e.__class__.__name__}: {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )
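A hedged sketch of the CSV adapters used directly; the staging file name is hypothetical. Rows lacking a bibkey receive a temporary "temp:<row_num>" key, and writing an empty tuple still produces a header-only report, with the ".csv" suffix appended automatically.

    from aletk.ResultMonad import Err

    from philoch_bib_sdk.adapters.io.csv import (
        load_staged_csv_allow_empty_bibkeys,
        write_report_csv,
    )

    result = load_staged_csv_allow_empty_bibkeys("new_entries.csv")
    if not isinstance(result, Err):
        items = result.out  # rows without a bibkey were keyed as "temp:<row_num>"
        print(f"staged {len(items)} items")

    # With no staged items, a header-only report is still written to report.csv.
    write_report_csv("report", staged=())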
philoch_bib_sdk/adapters/io/ods/__init__.py
@@ -0,0 +1,145 @@
"""ODS (OpenDocument Spreadsheet) adapters for bibliography I/O operations."""

import traceback
from typing import Dict, Tuple, Any
from pathlib import Path

from aletk.ResultMonad import Ok, Err
from aletk.utils import get_logger

from philoch_bib_sdk.logic.models import BibItem
from philoch_bib_sdk.converters.plaintext.bibitem.parser import parse_bibitem, ParsedBibItemData
from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey

lgr = get_logger(__name__)

__all__: list[str] = [
    "load_bibliography_ods",
    "load_staged_ods",
]


def _normalize_column_name(name: str) -> str:
    """
    Normalize column names from ODS to match ParsedBibItemData keys.

    ODS columns use hyphens (e.g., 'journal-id') while ParsedBibItemData uses underscores.
    """
    return name.replace("-", "_")


def _ods_row_to_parsed_data(row: dict[str, Any]) -> ParsedBibItemData:
    """
    Convert an ODS row (dict) to ParsedBibItemData.

    This helper exists to isolate the type: ignore directive.
    Polars returns dict[str, Any] while ParsedBibItemData is a TypedDict.
    We filter out None/empty values and normalize column names.
    """
    normalized = {_normalize_column_name(k): str(v) if v is not None else "" for k, v in row.items()}
    return {k: v for k, v in normalized.items() if v}  # type: ignore[return-value]


def load_bibliography_ods(
    filename: str, max_rows: int | None = None, bibstring_type: str = "simplified"
) -> Ok[Dict[str, BibItem]] | Err:
    """
    Load a bibliography from an ODS file.

    Args:
        filename: Path to the ODS file
        max_rows: Optional limit on number of rows to read (for testing)
        bibstring_type: Type of bibstring to use ('simplified', 'latex', 'unicode')

    Returns:
        Ok with dict mapping bibkey -> BibItem, or Err with details
    """
    try:
        import polars as pl

        if not Path(filename).exists():
            return Err(message=f"File not found: {filename}", code=1)

        # Read ODS file
        df = pl.read_ods(source=filename, has_header=True)

        if df.is_empty():
            return Err(message=f"ODS file is empty: {filename}", code=1)

        # Limit rows if requested
        if max_rows is not None:
            df = df.head(max_rows)

        # Convert to list of dicts
        rows = df.to_dicts()

        bibliography: dict[str, BibItem] = {}
        errors: list[str] = []
        seen_bibkeys: dict[str, int] = {}

        for i, row in enumerate(rows, start=2):  # Start at 2 because row 1 is header
            try:
                parsed_data = _ods_row_to_parsed_data(row)
                result = parse_bibitem(parsed_data, bibstring_type=bibstring_type)  # type: ignore[arg-type]

                if isinstance(result, Err):
                    errors.append(f"Row {i}: {result.message}")
                    continue

                bibitem = result.out
                bibkey_str = format_bibkey(bibitem.bibkey)

                # Check for duplicates
                if bibkey_str in seen_bibkeys:
                    first_row = seen_bibkeys[bibkey_str]
                    errors.append(f"Row {i}: Duplicate bibkey '{bibkey_str}' (first seen in row {first_row})")
                    continue

                bibliography[bibkey_str] = bibitem
                seen_bibkeys[bibkey_str] = i

            except Exception as e:
                errors.append(f"Row {i}: Unexpected error: {e}")
                continue

        if errors:
            lgr.warning(f"Loaded {len(bibliography)} items with {len(errors)} errors:\n" + "\n".join(errors[:10]))

        if not bibliography:
            error_summary = "\n".join(errors[:5])
            return Err(message=f"No valid items loaded from {filename}. Errors: {len(errors)}\n{error_summary}", code=1)

        lgr.info(f"Successfully loaded {len(bibliography)} items from {filename}")
        return Ok(bibliography)

    except Exception as e:
        return Err(
            message=f"Failed to load bibliography from {filename}: {e.__class__.__name__}: {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )


def load_staged_ods(filename: str, max_rows: int | None = None) -> Ok[Tuple[BibItem, ...]] | Err:
    """
    Load staged BibItems from an ODS file.

    Args:
        filename: Path to the ODS file
        max_rows: Optional limit on number of rows to read (for testing)

    Returns:
        Ok with tuple of BibItems, or Err with details
    """
    result = load_bibliography_ods(filename, max_rows=max_rows)

    if isinstance(result, Err):
        return result

    bibliography = result.out
    staged_items = tuple(bibliography.values())

    lgr.info(f"Successfully loaded {len(staged_items)} staged items from {filename}")

    return Ok(staged_items)
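A short sketch of the ODS loading path, assuming a hypothetical file name. Hyphenated spreadsheet headers such as 'journal-id' are normalized to 'journal_id' before parsing, and duplicate bibkeys are reported along with the row where they first appeared.

    from aletk.ResultMonad import Err

    from philoch_bib_sdk.adapters.io.ods import load_bibliography_ods

    result = load_bibliography_ods("library.ods", max_rows=50, bibstring_type="simplified")
    if isinstance(result, Err):
        print(result.message)
    else:
        for bibkey, item in result.out.items():
            print(bibkey)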
philoch_bib_sdk/adapters/plaintext/bibitem_reader.py
File without changes
philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py
@@ -0,0 +1,58 @@
from functools import partial
from typing import Callable, NamedTuple

import polars as pl

from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_parser import hard_parse_bibkey
from philoch_bib_sdk.logic.functions.journal_article_matcher import (
    TJournalBibkeyIndex,
    TReadIndex,
)


class ColumnNames(NamedTuple):
    bibkey: str
    journal: str
    volume: str
    number: str


def _read_from_ods(
    column_names: ColumnNames,
    file_path: str,
) -> TJournalBibkeyIndex:
    """
    Reads the specified columns from an ODS file and returns a TJournalBibkeyIndex dictionary.

    Args:
        column_names (ColumnNames): The names of the columns to read (journal, volume, number, bibkey).
        file_path (str): The path to the ODS file.

    Returns:
        TJournalBibkeyIndex: A dictionary mapping (journal, volume, number) tuples to bibkey values.
    """
    df = pl.read_ods(
        source=file_path,
        has_header=True,
        columns=[column_names.journal, column_names.volume, column_names.number, column_names.bibkey],
        schema_overrides={
            column_names.journal: pl.Utf8,
            column_names.volume: pl.Utf8,
            column_names.number: pl.Utf8,
            column_names.bibkey: pl.Utf8,
        },
    )

    if df.is_empty():
        raise ValueError(
            f"Tabular data at '{file_path}' is empty or does not contain the expected columns: {column_names}"
        )

    return {
        (row[column_names.journal], row[column_names.volume], row[column_names.number]): hard_parse_bibkey(
            row[column_names.bibkey]
        )
        for row in df.to_dicts()
    }


type THOFReadFromOds = Callable[[ColumnNames], TReadIndex]
hof_read_from_ods: THOFReadFromOds = lambda column_names: partial(_read_from_ods, column_names)
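How hof_read_from_ods is meant to be applied, sketched with hypothetical column and file names; this assumes TReadIndex is the single-argument reader type produced by the partial application above. Bind the column layout first, then call the resulting reader on a file path.

    columns = ColumnNames(bibkey="bibkey", journal="journal", volume="volume", number="number")
    read_index = hof_read_from_ods(columns)  # partial application: file_path -> TJournalBibkeyIndex
    index = read_index("journal_index.ods")  # {(journal, volume, number): parsed bibkey, ...}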
philoch_bib_sdk/converters/plaintext/author/formatter.py
@@ -0,0 +1,34 @@
from typing import Tuple
from aletk.utils import get_logger
from philoch_bib_sdk.logic.models import Author, TBibString

lgr = get_logger(__name__)


def _full_name_generic(given_name: str, family_name: str, mononym: str) -> str:
    if mononym:
        return mononym

    if not given_name and family_name:
        return family_name

    if not given_name:
        return ""

    if not family_name:
        return given_name

    return f"{family_name}, {given_name}"


def _format_single(author: Author, bibstring_type: TBibString) -> str:
    given_name = f"{getattr(author.given_name, bibstring_type)}"
    family_name = f"{getattr(author.family_name, bibstring_type)}"
    mononym = f"{getattr(author.mononym, bibstring_type)}"

    return _full_name_generic(given_name, family_name, mononym)


def format_author(authors: Tuple[Author, ...], bibstring_type: TBibString) -> str:
    names = (_format_single(author, bibstring_type=bibstring_type) for author in authors)
    return " and ".join(name for name in names if name)
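The precedence in _full_name_generic, illustrated with hypothetical names: a mononym wins outright, a single present name is used alone, and otherwise the result is "family, given". format_author then joins the non-empty names with " and ".

    _full_name_generic("", "", "Avicenna")      # -> "Avicenna"       (mononym wins)
    _full_name_generic("", "Quine", "")         # -> "Quine"          (family name only)
    _full_name_generic("Willard", "", "")       # -> "Willard"        (given name only)
    _full_name_generic("Willard", "Quine", "")  # -> "Quine, Willard"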