xml-data-extractor 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xml_config_builder.py +617 -0
- xml_data_extractor-0.1.0.dist-info/METADATA +408 -0
- xml_data_extractor-0.1.0.dist-info/RECORD +7 -0
- xml_data_extractor-0.1.0.dist-info/WHEEL +5 -0
- xml_data_extractor-0.1.0.dist-info/entry_points.txt +2 -0
- xml_data_extractor-0.1.0.dist-info/top_level.txt +2 -0
- xml_extractor.py +778 -0
xml_extractor.py
ADDED
|
@@ -0,0 +1,778 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
XML Field Extractor - Flexible XML to CSV/Parquet converter
|
|
4
|
+
Supports complex XML formats like Dublin Core, METS, LIDO, PREMIS, etc.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Optional, Union
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import yaml
|
|
14
|
+
from loguru import logger
|
|
15
|
+
from lxml import etree
|
|
16
|
+
from rich.console import Console
|
|
17
|
+
from rich.progress import (
|
|
18
|
+
Progress,
|
|
19
|
+
SpinnerColumn,
|
|
20
|
+
TextColumn,
|
|
21
|
+
BarColumn,
|
|
22
|
+
TaskProgressColumn,
|
|
23
|
+
TimeRemainingColumn,
|
|
24
|
+
)
|
|
25
|
+
from rich.table import Table
|
|
26
|
+
|
|
27
|
+
# Module-wide Rich console; the extractor and the CLI commands print through it.
console = Console()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class XMLExtractor:
    """Extract configured fields from XML files and write them as a table.

    The extractor is driven by a configuration mapping (normally loaded from
    YAML). The keys read by this class are:

    - ``root_xpath``: XPath selecting one element per output record.
    - ``fields``: list of field definitions (see ``_extract_field``).
    - ``namespaces``: optional prefix -> URI mapping; it overrides prefixes
      declared in the documents.
    - ``record_filters``: optional list of record-level conditions.
    - ``logging``: optional mapping with ``missing_xpath_verbosity`` and
      ``skip_invalid_xml``.

    Counters and error messages accumulate in ``self.stats`` for the lifetime
    of the instance.
    """

    def __init__(self, config: Dict[str, Any], debug: bool = False):
        """Store the configuration and reset all statistics.

        Args:
            config: Extraction configuration mapping.
            debug: When True, emit extra debug log lines and list recorded
                errors after the statistics table.
        """
        self.config = config
        self.debug = debug
        # Seed with the configured namespaces so they are available before any
        # document is parsed. ``or {}`` tolerates ``namespaces: null`` in YAML.
        self.namespaces: Dict[str, str] = dict(config.get("namespaces") or {})
        self.stats: Dict[str, Any] = {
            "files_processed": 0,
            "files_skipped": 0,
            "records_extracted": 0,
            "records_filtered": 0,
            "fields_missing": 0,
            "missing_fields_by_name": {},
            "errors": [],
        }

    def _logging_option(self, key: str, default: Any) -> Any:
        """Return ``config['logging'][key]``, tolerating a missing or null section."""
        return (self.config.get("logging") or {}).get(key, default)

    @staticmethod
    def _as_result_list(results: Any) -> List[Any]:
        """Return an XPath result as a list.

        Node-set expressions yield a list, but expressions such as
        ``string(...)``, ``count(...)`` or ``boolean(...)`` yield a bare
        string, float or bool. Iterating a bare string would produce single
        characters, so scalars are wrapped; an empty string counts as
        "no result".
        """
        if isinstance(results, list):
            return results
        if isinstance(results, str) and not results:
            return []
        return [results]

    @staticmethod
    def _scalar_to_text(value: Any) -> str:
        """Render a boolean or numeric XPath result as text."""
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, float) and value.is_integer():
            # count() and friends return floats; show 3.0 as "3".
            return str(int(value))
        return str(value)

    def _extract_namespaces(self, tree: "etree._ElementTree") -> Dict[str, str]:
        """Collect the namespace declarations visible on the root element.

        The default (prefix-less) namespace is exposed under the prefix
        ``"default"`` because XPath cannot address an empty prefix.
        """
        return {
            ("default" if prefix is None else prefix): uri
            for prefix, uri in tree.getroot().nsmap.items()
        }

    def _merge_namespaces(self, extracted_ns: Dict[str, str]) -> Dict[str, str]:
        """Merge document namespaces with configured ones (config wins)."""
        config_ns = self.config.get("namespaces") or {}

        merged = extracted_ns.copy()
        merged.update(config_ns)

        if self.debug:
            logger.debug(f"Extracted namespaces: {extracted_ns}")
            logger.debug(f"Config namespaces: {config_ns}")
            logger.debug(f"Merged namespaces: {merged}")

        return merged

    def _apply_filter(
        self, values: List[str], filter_config: Optional[Dict[str, str]]
    ) -> List[str]:
        """Keep only the values accepted by a field-level filter.

        Args:
            values: Extracted string values.
            filter_config: Mapping with ``type`` (``regex``, ``startswith`` or
                ``contains``, case-insensitive) and ``pattern``.

        Returns:
            The accepted values. ``values`` is returned unchanged when there
            is no filter, no pattern, an unknown type or an invalid regex.
        """
        if not filter_config or not values:
            return values

        filter_type = str(filter_config.get("type") or "").lower()
        raw_pattern = filter_config.get("pattern")

        if raw_pattern is None or raw_pattern == "":
            logger.warning("Filter specified but no pattern provided")
            return values

        # YAML may deliver numbers (``pattern: 2020``); compare as text.
        pattern = str(raw_pattern)

        if filter_type == "regex":
            try:
                compiled = re.compile(pattern)
            except re.error as e:
                # Same policy as _apply_transform: report and leave the
                # values untouched instead of aborting the run.
                logger.error(f"Invalid filter regex pattern '{pattern}': {e}")
                return values
            return [value for value in values if compiled.search(value)]

        if filter_type == "startswith":
            return [value for value in values if value.startswith(pattern)]

        if filter_type == "contains":
            return [value for value in values if pattern in value]

        logger.warning(f"Unknown filter type: {filter_type}")
        return values

    def _apply_transform(
        self, values: List[str], transform_config: Dict[str, Any]
    ) -> List[str]:
        """Rewrite values with a regex; values that do not match are dropped.

        Transform config keys:
            - regex: Pattern searched in every value.
            - group: Capture group to keep (default 0 = full match).
            - format: Optional ``str.format`` template. ``{0}``, ``{1}``, ...
              refer to the capture groups in order; named groups can be
              referenced by name. Groups that did not participate in the
              match are rendered as empty strings.

        Returns:
            The transformed values, or ``values`` unchanged when no usable
            regex is configured.
        """
        if not transform_config or not values:
            return values

        pattern_str = transform_config.get("regex", "")
        group = transform_config.get("group", 0)
        format_str = transform_config.get("format")

        if not pattern_str:
            logger.warning("Transform specified but no regex pattern provided")
            return values

        try:
            pattern = re.compile(pattern_str)
        except re.error as e:
            logger.error(f"Invalid regex pattern '{pattern_str}': {e}")
            return values

        transformed = []
        for value in values:
            match = pattern.search(value)
            if match is None:
                continue

            if format_str:
                # An optional group that did not match is None; formatting it
                # directly would put the literal text "None" into the output.
                positional = ["" if g is None else g for g in match.groups()]
                named = {
                    name: "" if g is None else g
                    for name, g in match.groupdict().items()
                }
                try:
                    transformed.append(format_str.format(*positional, **named))
                    continue
                except (IndexError, KeyError, ValueError) as e:
                    logger.warning(
                        f"Transform format error: {e}. Pattern: '{pattern_str}', Format: '{format_str}'"
                    )
                    # Fall through to the plain group extraction below.

            try:
                extracted = match.group(group)
            except IndexError:
                logger.warning(
                    f"Transform regex group {group} not found in pattern '{pattern_str}'"
                )
                extracted = match.group(0)  # fall back to the full match

            if extracted:
                transformed.append(extracted)

        return transformed

    def _extract_field(
        self, element: "etree._Element", field_config: Dict[str, Any], file_path: str
    ) -> Union[str, List[str]]:
        """Evaluate one field definition against a record element.

        Field config keys: ``xpath`` (required), ``column``, ``as_list``
        (default False), ``separator`` (default ``" | "``), ``filter``,
        ``transform`` and ``url_prefix``. Filter, transform and prefix are
        applied in that order.

        Returns:
            A list of values when ``as_list`` is set, otherwise the values
            joined with ``separator``. Missing data gives ``[]`` or ``""``.
        """
        xpath = field_config.get("xpath", "")
        as_list = field_config.get("as_list", False)
        separator = field_config.get("separator", " | ")
        filter_config = field_config.get("filter")

        if not xpath:
            logger.warning(
                f"No xpath specified for field: {field_config.get('column')}"
            )
            return ""

        try:
            results = self._as_result_list(
                element.xpath(xpath, namespaces=self.namespaces)
            )
        except etree.XPathEvalError as e:
            logger.error(f"Invalid XPath expression '{xpath}': {e}")
            self.stats["errors"].append(f"XPath error in {file_path}: {e}")
            return [] if as_list else ""

        if self.debug and results:
            logger.debug(
                f"XPath '{xpath}' found {len(results)} result(s) in {file_path}"
            )

        values = []
        for result in results:
            if isinstance(result, str):
                # Attribute and text() results; plain str drops lxml's
                # parent reference.
                values.append(str(result))
            elif isinstance(result, (bool, int, float)):
                values.append(self._scalar_to_text(result))
            elif isinstance(result, etree._Element):
                # Element results contribute their own stripped text only.
                text = (result.text or "").strip()
                if text:
                    values.append(text)

        if filter_config:
            values = self._apply_filter(values, filter_config)

        transform_config = field_config.get("transform")
        if transform_config:
            values = self._apply_transform(values, transform_config)

        url_prefix = field_config.get("url_prefix")
        if url_prefix:
            values = [f"{url_prefix}{value}" for value in values]

        if not values:
            verbosity = self._logging_option("missing_xpath_verbosity", "summary")
            if verbosity in ["summary", "detailed"]:
                self.stats["fields_missing"] += 1
                # Per-field tally feeds the missing-fields summary.
                field_name = field_config.get("column", xpath)
                missing = self.stats["missing_fields_by_name"]
                missing[field_name] = missing.get(field_name, 0) + 1

            if verbosity == "detailed":
                logger.warning(
                    f"XPath '{xpath}' returned no results in {file_path}"
                )
            return [] if as_list else ""

        return values if as_list else separator.join(values)

    @staticmethod
    def _condition_holds(condition: str, value: Any, text_values: List[str]) -> bool:
        """Evaluate one record-filter condition against the extracted texts.

        Configured values are compared as text, so YAML numbers and dates
        behave like their string form. Unknown conditions, and conditions
        whose required ``value`` is missing or of the wrong kind, are ignored
        (treated as passed).

        Raises:
            re.error: If ``matches``/``not_matches`` has an invalid pattern.
        """
        if condition == "exists":
            return bool(text_values)
        if condition == "not_exists":
            return not text_values

        if condition in ("in", "not_in"):
            if not isinstance(value, list):
                return True
            allowed = {str(item) for item in value}
            hit = any(text in allowed for text in text_values)
            return hit if condition == "in" else not hit

        if value is None:
            return True
        expected = str(value)

        if condition == "equals":
            return any(text == expected for text in text_values)
        if condition == "not_equals":
            return not any(text == expected for text in text_values)
        if condition == "contains":
            return any(expected in text for text in text_values)
        if condition == "not_contains":
            return not any(expected in text for text in text_values)
        if condition in ("matches", "not_matches"):
            pattern = re.compile(expected)
            hit = any(pattern.search(text) for text in text_values)
            return hit if condition == "matches" else not hit
        if condition == "date_after":
            # Plain string comparison; correct for ISO-formatted dates.
            return any(text > expected for text in text_values)
        if condition == "date_before":
            return any(text < expected for text in text_values)

        return True

    def _check_record_filters(self, element: "etree._Element", file_path: str) -> bool:
        """Check whether a record satisfies every configured record filter.

        Each filter has ``xpath``, ``condition`` (default ``exists``) and an
        optional ``value``. Filters without an xpath are ignored. A filter
        with an invalid XPath or an invalid regex rejects the record.

        Returns:
            True if the record should be kept, False if it is filtered out.
        """
        filters = self.config.get("record_filters", [])
        if not filters:
            return True  # No filters means include all records

        for filter_config in filters:
            xpath = filter_config.get("xpath", "")
            condition = filter_config.get("condition", "exists")
            value = filter_config.get("value")

            if not xpath:
                continue

            try:
                results = self._as_result_list(
                    element.xpath(xpath, namespaces=self.namespaces)
                )
            except etree.XPathEvalError as e:
                logger.error(f"Invalid filter XPath '{xpath}': {e}")
                return False

            text_values = []
            for result in results:
                if isinstance(result, str):
                    text_values.append(str(result))
                elif isinstance(result, bool):
                    # A boolean expression that is false counts as "no value",
                    # so e.g. ``exists`` fails for it.
                    if result:
                        text_values.append("true")
                elif isinstance(result, (int, float)):
                    text_values.append(self._scalar_to_text(result))
                elif getattr(result, "text", None):
                    text_values.append(result.text)

            try:
                passed = self._condition_holds(condition, value, text_values)
            except re.error as e:
                logger.error(f"Invalid filter regex '{value}': {e}")
                return False

            if not passed:
                return False

        return True  # All filters passed

    def _process_record(
        self,
        element: "etree._Element",
        field_configs: List[Dict[str, Any]],
        file_path: str,
    ) -> Dict[str, Union[str, List[str]]]:
        """Build one output row: the source file name plus every configured field."""
        # The file name goes first so it becomes the first output column.
        record: Dict[str, Union[str, List[str]]] = {
            "_source_file": Path(file_path).name
        }

        for field_config in field_configs:
            column_name = field_config.get("column", "")
            record[column_name] = self._extract_field(element, field_config, file_path)

        return record

    @staticmethod
    def _has_deleted_oai_header(tree: "etree._ElementTree") -> bool:
        """Return True if the document contains an OAI header marked deleted."""
        try:
            return bool(
                tree.xpath(
                    "//oai:header[@status='deleted']",
                    namespaces={"oai": "http://www.openarchives.org/OAI/2.0/"},
                )
            )
        except etree.XPathError:
            return False

    def _process_file(self, file_path: Path) -> List[Dict[str, Union[str, List[str]]]]:
        """Parse one XML file and return its extracted records.

        Every file ends up in exactly one of ``files_processed`` or
        ``files_skipped``. Files that yield no record elements (including
        deleted OAI records) and files hit by a root XPath problem count as
        skipped.

        Raises:
            etree.XMLSyntaxError: For unparsable XML when the
                ``logging.skip_invalid_xml`` option is false.
            Exception: Any other error is recorded in the stats and re-raised.
        """
        records: List[Dict[str, Union[str, List[str]]]] = []

        try:
            parser = etree.XMLParser(
                remove_blank_text=True,
                resolve_entities=False,
                recover=True,  # Try to recover from errors
            )
            tree = etree.parse(str(file_path), parser)

            # A recovering parser can hand back a tree without a root for
            # unusable input; route that through the invalid-XML handling.
            if tree.getroot() is None:
                raise etree.XMLSyntaxError(
                    "document has no root element", 0, 0, 0, str(file_path)
                )

            self.namespaces = self._merge_namespaces(self._extract_namespaces(tree))

            root_xpath = self.config.get("root_xpath", "")
            if not root_xpath:
                logger.error("No root_xpath specified in configuration")
                self.stats["files_skipped"] += 1
                return records

            try:
                record_elements = tree.xpath(root_xpath, namespaces=self.namespaces)
            except etree.XPathEvalError as e:
                logger.error(f"Invalid root_xpath '{root_xpath}': {e}")
                self.stats["errors"].append(f"Root XPath error in {file_path}: {e}")
                self.stats["files_skipped"] += 1
                return records

            # A scalar expression (count(), string(), ...) selects no records.
            if not isinstance(record_elements, list):
                record_elements = []

            if not record_elements:
                if self._has_deleted_oai_header(tree):
                    logger.info(f"Skipping deleted OAI record in {file_path.name}")
                else:
                    logger.warning(
                        f"No records found with root_xpath '{root_xpath}' in {file_path}"
                    )
                self.stats["files_skipped"] += 1
                return records

            logger.info(f"Found {len(record_elements)} record(s) in {file_path.name}")

            field_configs = self.config.get("fields", [])
            for idx, element in enumerate(record_elements, 1):
                if self.debug:
                    logger.debug(
                        f"Processing record {idx}/{len(record_elements)} in {file_path.name}"
                    )

                if not self._check_record_filters(element, str(file_path)):
                    self.stats["records_filtered"] += 1
                    if self.debug:
                        logger.debug(f"Record {idx} filtered out by record_filters")
                    continue

                records.append(
                    self._process_record(element, field_configs, str(file_path))
                )
                self.stats["records_extracted"] += 1

            self.stats["files_processed"] += 1

        except etree.XMLSyntaxError as e:
            logger.error(f"Invalid XML in {file_path}: {e}")
            self.stats["files_skipped"] += 1
            self.stats["errors"].append(f"XML syntax error in {file_path}: {e}")

            if not self._logging_option("skip_invalid_xml", True):
                raise

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["files_skipped"] += 1
            self.stats["errors"].append(f"Error in {file_path}: {e}")
            raise

        return records

    def extract(
        self, input_dir: Path, output_file: Path, output_format: str = "csv"
    ) -> None:
        """Process every ``*.xml`` file in ``input_dir`` and write the result.

        Files are handled in sorted order so the output row order is
        reproducible. Nothing is written when no records were extracted.

        Args:
            input_dir: Directory searched (non-recursively) for ``*.xml``.
            output_file: Destination file.
            output_format: One of ``csv``, ``parquet``, ``excel``, ``json``.
        """
        # Function-scope import: used only to neutralise "[...]" in field names.
        from rich.markup import escape

        xml_files = sorted(input_dir.glob("*.xml"))

        if not xml_files:
            console.print(f"[yellow]No XML files found in {input_dir}[/yellow]")
            return

        console.print(f"[cyan]Found {len(xml_files)} XML file(s) to process[/cyan]")

        all_records = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            console=console,
        ) as progress:
            task = progress.add_task(
                "[cyan]Processing XML files...", total=len(xml_files)
            )

            for xml_file in xml_files:
                progress.update(
                    task, description=f"[cyan]Processing {xml_file.name}..."
                )
                all_records.extend(self._process_file(xml_file))
                progress.advance(task)

        if all_records:
            self._write_output(all_records, output_file, output_format)
            console.print(
                f"\n[green]✓ Successfully wrote extracted data from {len(all_records)} record(s) to {output_file.absolute()}[/green]"
            )
        else:
            console.print(
                "\n[yellow]⚠ No records extracted, output not created[/yellow]"
            )

        verbosity = self._logging_option("missing_xpath_verbosity", "summary")
        missing = self.stats["missing_fields_by_name"]
        if verbosity in ["summary", "detailed"] and missing:
            console.print("\n[yellow]Missing Fields Summary:[/yellow]")
            logger.info("Missing Fields Summary:")
            for field_name, count in sorted(missing.items()):
                # Field names fall back to the raw XPath, whose predicates
                # ("[...]") would otherwise be parsed as Rich markup.
                console.print(
                    f" [yellow]• {escape(str(field_name))}: {count} record(s)[/yellow]"
                )
                logger.info(f" • {field_name}: {count} record(s)")

        self._print_statistics()

    @staticmethod
    def _join_list_columns(df: "pd.DataFrame") -> "pd.DataFrame":
        """Return a copy of ``df`` with list cells joined by ``" | "``.

        Used for formats without a native list type (CSV, Excel).
        """
        flat = df.copy()
        for col in flat.columns:
            if flat[col].apply(lambda cell: isinstance(cell, list)).any():
                flat[col] = flat[col].apply(
                    lambda cell: " | ".join(cell) if isinstance(cell, list) else cell
                )
        return flat

    def _write_output(
        self,
        records: List[Dict[str, Union[str, List[str]]]],
        output_file: Path,
        output_format: str,
    ) -> None:
        """Write the records to ``output_file`` using pandas.

        Parquet and JSON keep list values as lists; CSV and Excel get them
        joined into one string. The output directory is created if needed.

        Raises:
            ValueError: If ``output_format`` is not supported.
        """
        if not records:
            return

        # Reject unknown formats before touching the file system.
        if output_format not in ("csv", "parquet", "excel", "json"):
            raise ValueError(f"Unsupported output format: {output_format}")

        df = pd.DataFrame(records)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        if output_format == "csv":
            self._join_list_columns(df).to_csv(
                output_file, index=False, encoding="utf-8"
            )
            logger.info(f"Wrote {len(records)} records to CSV: {output_file}")
        elif output_format == "parquet":
            df.to_parquet(output_file, index=False, engine="pyarrow")
            logger.info(f"Wrote {len(records)} records to Parquet: {output_file}")
        elif output_format == "excel":
            self._join_list_columns(df).to_excel(
                output_file, index=False, engine="openpyxl"
            )
            logger.info(f"Wrote {len(records)} records to Excel: {output_file}")
        else:
            df.to_json(output_file, orient="records", indent=2, force_ascii=False)
            logger.info(f"Wrote {len(records)} records to JSON: {output_file}")

    def _print_statistics(self) -> None:
        """Print the run statistics as a table and mirror them to the log.

        In debug mode the first ten recorded error messages are printed too.
        """
        # Function-scope import: error text may contain "[...]" from XPath
        # expressions and must not be parsed as Rich markup.
        from rich.markup import escape

        stats = self.stats
        errors = stats["errors"]

        rows = [
            ("Files processed", stats["files_processed"]),
            ("Files skipped", stats["files_skipped"]),
            ("Records extracted", stats["records_extracted"]),
        ]
        if stats["records_filtered"] > 0:
            rows.append(("Records filtered out", stats["records_filtered"]))
        rows.append(("Fields with missing data", stats["fields_missing"]))
        rows.append(("Errors", len(errors)))

        table = Table(
            title="Extraction Statistics", show_header=True, header_style="bold magenta"
        )
        table.add_column("Metric", style="cyan", no_wrap=True)
        table.add_column("Value", style="green", justify="right")
        for label, value in rows:
            table.add_row(label, str(value))

        console.print()
        console.print(table)

        logger.info("Extraction Statistics:")
        for label, value in rows:
            logger.info(f" {label}: {value}")

        if errors and self.debug:
            console.print("\n[red]Errors encountered:[/red]")
            for error in errors[:10]:  # Show first 10 errors
                console.print(f" • {escape(str(error))}")
            if len(errors) > 10:
                console.print(f" ... and {len(errors) - 10} more")
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def load_config(config_file: Path) -> Dict[str, Any]:
    """Load the YAML configuration file and check its required keys.

    Args:
        config_file: Path to the YAML configuration.

    Returns:
        The parsed configuration mapping.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        ValueError: If the document is not a mapping, a required key
            (``root_xpath``, ``fields``) is missing, or ``fields`` is empty.
    """
    if not config_file.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_file}")

    with open(config_file, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load gives None for an empty document and a list or scalar for
    # non-mapping documents; the membership checks below need a mapping.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration must be a YAML mapping, got {type(config).__name__}: {config_file}"
        )

    for field in ("root_xpath", "fields"):
        if field not in config:
            raise ValueError(f"Required field '{field}' missing in configuration")

    if not config["fields"]:
        raise ValueError("No fields configured for extraction")

    return config
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
# Root command group; the "run" and "build" subcommands attach to it.
@click.group()
def cli():
    """XML Field Extractor — extract XML data or build configs interactively."""
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
@cli.command("run")
@click.argument(
    "config",
    type=click.Path(exists=True, path_type=Path),
)
@click.option(
    "--input",
    "-i",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    help="Input directory containing XML files (overrides config)",
)
@click.option(
    "--output",
    "-o",
    "output_file",
    type=click.Path(path_type=Path),
    help="Output file path (overrides config)",
)
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["csv", "parquet", "excel", "json"], case_sensitive=False),
    default="csv",
    help="Output format (default: csv)",
)
@click.option(
    "--debug",
    is_flag=True,
    default=False,
    help="Enable debug mode with verbose logging",
)
def run_cmd(
    config: Path,
    input_dir: Optional[Path],
    output_file: Optional[Path],
    output_format: str,
    debug: bool,
):
    """Extract fields from XML files using CONFIG yaml file.

    Supports Dublin Core, METS, LIDO, PREMIS, and other XML formats.
    Output formats: CSV, Parquet (native list support), Excel, JSON.
    """
    # The docstring above is the command's --help text; keep it user-facing.
    # Function-scope import: exception text is escaped before it is embedded
    # in Rich markup in the error handler below.
    from rich.markup import escape

    console.print()
    console.print("[bold cyan]═══════════════════════════════════════[/bold cyan]")
    console.print("[bold cyan] XML Field Extractor[/bold cyan]")
    console.print("[bold cyan]═══════════════════════════════════════[/bold cyan]")
    console.print()

    try:
        console.print(f"[cyan]Loading configuration from {config}...[/cyan]")
        cfg = load_config(config)

        # Command-line values take precedence over the configuration file.
        if input_dir:
            cfg["input_directory"] = str(input_dir)
        if output_file:
            cfg["output_file"] = str(output_file)

        # ``or`` also covers keys that are present but null in the YAML.
        output_base = Path(cfg.get("output_file") or "output")

        format_extensions = {
            "csv": ".csv",
            "parquet": ".parquet",
            "excel": ".xlsx",
            "json": ".json",
        }

        # Only add an extension when the name does not already carry one of
        # the known ones; an explicit known extension is left untouched.
        extension = format_extensions.get(output_format, ".csv")
        if output_base.suffix in format_extensions.values():
            output_path = output_base
        else:
            output_path = output_base.with_suffix(extension)

        # The log file sits next to the output file and shares its stem.
        log_path = output_path.parent / f"{output_path.stem}.log"

        logger.remove()
        logger.add(
            # markup=False: log messages quote XPath expressions, whose
            # "[...]" predicates Rich would otherwise parse as style tags.
            lambda msg: console.print(msg, end="", markup=False),
            level="DEBUG" if debug else "WARNING",
            format="<level>{message}</level>",
        )
        logger.add(
            log_path,
            level="DEBUG",
            format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
            rotation="10 MB",
        )

        logger.info("=" * 50)
        logger.info("XML Field Extractor - Starting extraction")
        logger.info(f"Configuration: {config}")
        logger.info(f"Debug mode: {debug}")

        input_path = Path(cfg.get("input_directory") or "./example_data")

        if not input_path.exists():
            console.print(f"[red]✗ Input directory not found: {input_path}[/red]")
            return

        console.print(f"[cyan]Input directory: {input_path}[/cyan]")
        console.print(f"[cyan]Output file: {output_path}[/cyan]")
        console.print(f"[cyan]Output format: {output_format.upper()}[/cyan]")
        console.print(f"[cyan]Log file: {log_path}[/cyan]")
        console.print()

        extractor = XMLExtractor(cfg, debug=debug)
        extractor.extract(input_path, output_path, output_format)

        logger.info("Extraction completed successfully")
        console.print("\n[green]✓ Extraction completed![/green]")

    except Exception as e:
        logger.exception("Fatal error during extraction")
        # Escape so brackets in the message cannot break the markup and mask
        # the real error.
        console.print(f"\n[red]✗ Error: {escape(str(e))}[/red]")
        raise click.Abort()
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
@cli.command("build")
@click.argument(
    "xml_file",
    type=click.Path(exists=True, path_type=Path),
)
def build_cmd(xml_file: Path) -> None:
    """Interactively build a config YAML from an XML sample file."""
    # The docstring above is the command's --help text.
    # Imported here so the builder module is only loaded when this command runs.
    from xml_config_builder import ConfigBuilderApp

    app = ConfigBuilderApp(xml_file)
    config_path = app.run()

    # A falsy result means there is no configuration to run.
    if not config_path:
        return

    # NOTE(review): app.run() presumably returns the saved config location;
    # coerce to Path because load_config() and .stem below need one.
    config_path = Path(config_path)

    console.print(f"\n[cyan]Running extraction with {config_path} …[/cyan]\n")
    cfg = load_config(config_path)

    # ``or`` also covers keys that are present but null in the YAML.
    output_base = Path(cfg.get("output_file") or config_path.stem)
    output_path = output_base.with_suffix(".csv")
    log_path = output_path.parent / f"{output_path.stem}.log"

    logger.remove()
    logger.add(
        # markup=False: log messages quote XPath expressions, whose "[...]"
        # predicates Rich would otherwise parse as style tags.
        lambda msg: console.print(msg, end="", markup=False),
        level="WARNING",
        format="<level>{message}</level>",
    )
    logger.add(
        log_path,
        level="DEBUG",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
        rotation="10 MB",
    )

    # Without a configured input directory, use the sample file's directory.
    input_path = Path(cfg.get("input_directory") or xml_file.parent)
    extractor = XMLExtractor(cfg)
    extractor.extract(input_path, output_path, "csv")
    console.print("\n[green]✓ Extraction completed![/green]")
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
# `main` stays available as an alias of the click group `cli`.
main = cli

if __name__ == "__main__":
    main()
|