xml-data-extractor 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xml_extractor.py ADDED
@@ -0,0 +1,778 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ XML Field Extractor - Flexible XML to CSV/Parquet converter
4
+ Supports complex XML formats like Dublin Core, METS, LIDO, PREMIS, etc.
5
+ """
6
+
7
+ import re
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ import click
12
+ import pandas as pd
13
+ import yaml
14
+ from loguru import logger
15
+ from lxml import etree
16
+ from rich.console import Console
17
+ from rich.progress import (
18
+ Progress,
19
+ SpinnerColumn,
20
+ TextColumn,
21
+ BarColumn,
22
+ TaskProgressColumn,
23
+ TimeRemainingColumn,
24
+ )
25
+ from rich.table import Table
26
+
27
# Shared rich console: used for user-facing terminal output throughout this
# module, and as the target of the console log sink set up by the commands.
console = Console()
28
+
29
+
30
class XMLExtractor:
    """Extract configured fields from XML files into tabular records.

    The extractor is driven by a configuration mapping. The keys read by this
    class are ``namespaces``, ``root_xpath``, ``fields``, ``record_filters``
    and ``logging``. Running counters and collected error messages are kept
    in ``self.stats``.
    """

    def __init__(self, config: Dict[str, Any], debug: bool = False):
        """Store the configuration and reset the statistics.

        Args:
            config: Parsed configuration mapping.
            debug: When true, extra debug messages are logged.
        """
        self.config = config
        self.debug = debug
        # Start from the configured namespaces so they are always available.
        # A `namespaces:` key with no entries loads from YAML as None, hence
        # the `or {}` fallback.
        self.namespaces: Dict[str, str] = dict(config.get("namespaces") or {})
        self.stats: Dict[str, Any] = {
            "files_processed": 0,
            "files_skipped": 0,
            "records_extracted": 0,
            "records_filtered": 0,
            "fields_missing": 0,
            "missing_fields_by_name": {},
            "errors": [],
        }

    def _logging_option(self, key: str, default: Any) -> Any:
        """Return one option from the optional ``logging`` config section."""
        # The section may be absent or present-but-empty (None).
        return (self.config.get("logging") or {}).get(key, default)

    def _extract_namespaces(self, tree: "etree._ElementTree") -> Dict[str, str]:
        """Return the namespace map declared on the document's root element.

        The default (prefix-less) namespace is exposed under the prefix
        ``default`` because XPath needs a non-empty prefix to address it.
        A tree without a root element yields an empty mapping.
        """
        root = tree.getroot()
        if root is None:
            return {}
        return {
            ("default" if prefix is None else prefix): uri
            for prefix, uri in root.nsmap.items()
        }

    def _merge_namespaces(self, extracted_ns: Dict[str, str]) -> Dict[str, str]:
        """Merge document namespaces with configured ones (config wins)."""
        config_ns = self.config.get("namespaces") or {}
        merged = {**extracted_ns, **config_ns}

        if self.debug:
            logger.debug(f"Extracted namespaces: {extracted_ns}")
            logger.debug(f"Config namespaces: {config_ns}")
            logger.debug(f"Merged namespaces: {merged}")

        return merged

    def _apply_filter(
        self, values: List[str], filter_config: Optional[Dict[str, str]]
    ) -> List[str]:
        """Keep only the values accepted by a field-level filter.

        ``filter_config`` supplies ``type`` (``regex``, ``startswith`` or
        ``contains``; case-insensitive) and ``pattern``. A missing pattern,
        an unknown type or an invalid regex is logged and the values are
        returned unfiltered.
        """
        if not filter_config or not values:
            return values

        filter_type = str(filter_config.get("type") or "").lower()
        pattern = filter_config.get("pattern", "")

        if not pattern:
            logger.warning("Filter specified but no pattern provided")
            return values
        pattern = str(pattern)

        # Pick the predicate once instead of re-deciding it for every value.
        if filter_type == "regex":
            try:
                accepts = re.compile(pattern).search
            except re.error as e:
                # Same policy as an invalid transform regex: log and leave
                # the values untouched rather than abort the run.
                logger.error(f"Invalid filter regex '{pattern}': {e}")
                return values
        elif filter_type == "startswith":
            accepts = lambda value: value.startswith(pattern)  # noqa: E731
        elif filter_type == "contains":
            accepts = lambda value: pattern in value  # noqa: E731
        else:
            logger.warning(f"Unknown filter type: {filter_type}")
            return values

        return [value for value in values if accepts(value)]

    def _apply_transform(
        self, values: List[str], transform_config: Dict[str, Any]
    ) -> List[str]:
        """Apply regex transformation to extract specific parts from values.

        Transform config can have:
        - regex: Pattern to search for in each value
        - group: Which capture group to use (default: 0 = full match)
        - format: Optional format string fed with the capture groups.
          ``{0}`` is the first capture group, ``{1}`` the second, and so on;
          named groups can be referenced as ``{name}``.

        Values that do not match the pattern are dropped. If the format
        string cannot be applied, the configured group is used instead; if
        that group does not exist, the full match is used.
        """
        if not transform_config or not values:
            return values

        pattern_str = transform_config.get("regex", "")
        group = transform_config.get("group", 0)
        format_str = transform_config.get("format")

        if not pattern_str:
            logger.warning("Transform specified but no regex pattern provided")
            return values

        try:
            pattern = re.compile(pattern_str)
        except re.error as e:
            logger.error(f"Invalid regex pattern '{pattern_str}': {e}")
            return values

        transformed: List[str] = []
        for value in values:
            match = pattern.search(value)
            if not match:
                continue

            if format_str:
                try:
                    transformed.append(
                        format_str.format(*match.groups(), **match.groupdict())
                    )
                    continue
                except (IndexError, KeyError, ValueError) as e:
                    # KeyError covers a `{name}` placeholder with no such
                    # named group; fall through to plain group extraction.
                    logger.warning(
                        f"Transform format error: {e}. Pattern: '{pattern_str}', Format: '{format_str}'"
                    )

            try:
                extracted = match.group(group)
            except IndexError:
                logger.warning(
                    f"Transform regex group {group} not found in pattern '{pattern_str}'"
                )
                extracted = match.group(0)

            if extracted:
                transformed.append(extracted)

        return transformed

    @staticmethod
    def _xpath_result_to_texts(
        results: Any, strip_element_text: bool = False
    ) -> List[str]:
        """Convert whatever an XPath evaluation returned into text values.

        XPath expressions do not always return node lists: ``string(...)``
        returns a single string, ``count(...)`` a float and ``boolean(...)``
        a bool. A scalar becomes a one-item list (booleans as ``true`` /
        ``false``, integral floats without the trailing ``.0``).

        For list results, string items are kept as they are. Any other item
        contributes its ``text`` attribute when that is a non-empty string,
        optionally stripped first; items without usable text are skipped.
        """
        if isinstance(results, str):
            # Iterating a bare string would split it into single characters.
            return [str(results)]
        if isinstance(results, bool):  # must precede the int/float check
            return ["true" if results else "false"]
        if isinstance(results, (int, float)):
            number = float(results)
            return [str(int(number)) if number.is_integer() else str(results)]

        texts: List[str] = []
        for item in results:
            if isinstance(item, str):
                # Plain str() drops the back-reference that lxml's string
                # results carry to their parent element.
                texts.append(str(item))
                continue
            text = getattr(item, "text", None)
            if not isinstance(text, str):
                continue
            if strip_element_text:
                text = text.strip()
            if text:
                texts.append(text)
        return texts

    def _extract_field(
        self, element: "etree._Element", field_config: Dict[str, Any], file_path: str
    ) -> Union[str, List[str]]:
        """Extract one configured field from a record element.

        The field's XPath is evaluated relative to ``element``. The results
        are converted to text, then passed through the optional ``filter``,
        ``transform`` and ``url_prefix`` steps, in that order.

        Returns:
            A list of values when ``as_list`` is set, otherwise the values
            joined with ``separator`` (default ``" | "``). When nothing is
            found, or the XPath is invalid, an empty list or string is
            returned.
        """
        xpath = field_config.get("xpath", "")
        as_list = field_config.get("as_list", False)
        separator = field_config.get("separator", " | ")

        if not xpath:
            logger.warning(
                f"No xpath specified for field: {field_config.get('column')}"
            )
            return ""

        try:
            results = element.xpath(xpath, namespaces=self.namespaces)
        except etree.XPathEvalError as e:
            logger.error(f"Invalid XPath expression '{xpath}': {e}")
            self.stats["errors"].append(f"XPath error in {file_path}: {e}")
            return [] if as_list else ""

        if self.debug and results:
            found = len(results) if isinstance(results, list) else 1
            logger.debug(f"XPath '{xpath}' found {found} result(s) in {file_path}")

        values = self._xpath_result_to_texts(results, strip_element_text=True)

        filter_config = field_config.get("filter")
        if filter_config:
            values = self._apply_filter(values, filter_config)

        transform_config = field_config.get("transform")
        if transform_config:
            values = self._apply_transform(values, transform_config)

        url_prefix = field_config.get("url_prefix")
        if url_prefix:
            values = [f"{url_prefix}{value}" for value in values]

        if not values:
            verbosity = self._logging_option("missing_xpath_verbosity", "summary")
            if verbosity in ("summary", "detailed"):
                self.stats["fields_missing"] += 1
                # Count misses per column for the end-of-run summary.
                field_name = field_config.get("column", xpath)
                missing = self.stats["missing_fields_by_name"]
                missing[field_name] = missing.get(field_name, 0) + 1

            if verbosity == "detailed":
                logger.warning(f"XPath '{xpath}' returned no results in {file_path}")
            return [] if as_list else ""

        return values if as_list else separator.join(values)

    @staticmethod
    def _filter_value_text(value: Any) -> str:
        """Render a configured filter value as the text it is compared with.

        YAML turns unquoted scalars such as ``2020-01-01`` or ``5`` into
        date or number objects, while extracted XML values are always
        strings. Dates are rendered with ``isoformat()``, booleans as
        ``true`` / ``false`` and everything else with ``str()``.
        """
        if isinstance(value, str):
            return value
        if isinstance(value, bool):
            return "true" if value else "false"
        if hasattr(value, "isoformat"):
            return value.isoformat()
        return str(value)

    def _filter_matches(
        self, condition: str, value: Any, text_values: List[str]
    ) -> bool:
        """Return whether ``text_values`` satisfy one record-filter condition.

        A filter that cannot be evaluated is ignored, i.e. it returns True.
        That covers an unknown condition, a comparison condition without a
        ``value``, and ``in`` / ``not_in`` without a list. An invalid regex
        is logged and rejects the record, the same way an invalid filter
        XPath does.
        """
        if condition == "exists":
            return bool(text_values)
        if condition == "not_exists":
            return not text_values

        if condition in ("in", "not_in"):
            if not isinstance(value, list):
                return True
            allowed = {self._filter_value_text(item) for item in value}
            found = any(v in allowed for v in text_values)
            return found if condition == "in" else not found

        if value is None:
            return True
        expected = self._filter_value_text(value)

        if condition == "equals":
            return expected in text_values
        if condition == "not_equals":
            return expected not in text_values
        if condition == "contains":
            return any(expected in v for v in text_values)
        if condition == "not_contains":
            return not any(expected in v for v in text_values)
        if condition in ("matches", "not_matches"):
            try:
                pattern = re.compile(expected)
            except re.error as e:
                logger.error(f"Invalid filter regex '{expected}': {e}")
                return False
            found = any(pattern.search(v) for v in text_values)
            return found if condition == "matches" else not found
        if condition == "date_after":
            # Plain string comparison; this orders correctly for ISO dates.
            return any(v > expected for v in text_values)
        if condition == "date_before":
            return any(v < expected for v in text_values)

        return True

    def _check_record_filters(self, element: "etree._Element", file_path: str) -> bool:
        """Check if record matches all filter conditions.

        Each entry of the ``record_filters`` config list supplies an
        ``xpath``, a ``condition`` (default ``exists``) and, for most
        conditions, a ``value``. Entries without an xpath are skipped.

        Returns True if record should be included, False if it should be
        filtered out. A filter whose XPath cannot be evaluated excludes the
        record.
        """
        filters = self.config.get("record_filters", [])
        if not filters:
            return True  # No filters means include all records

        for filter_config in filters:
            xpath = filter_config.get("xpath", "")
            if not xpath:
                continue

            try:
                results = element.xpath(xpath, namespaces=self.namespaces)
            except etree.XPathEvalError as e:
                logger.error(f"Invalid filter XPath '{xpath}': {e}")
                return False

            text_values = self._xpath_result_to_texts(results)
            condition = filter_config.get("condition", "exists")
            if not self._filter_matches(
                condition, filter_config.get("value"), text_values
            ):
                return False

        return True  # All filters passed

    def _process_record(
        self,
        element: "etree._Element",
        field_configs: List[Dict[str, Any]],
        file_path: str,
    ) -> Dict[str, Union[str, List[str]]]:
        """Build one output row: the source file name plus every field."""
        # The file name goes in first so it becomes the first column.
        record: Dict[str, Union[str, List[str]]] = {
            "_source_file": Path(file_path).name
        }
        for field_config in field_configs:
            record[field_config.get("column", "")] = self._extract_field(
                element, field_config, file_path
            )
        return record

    def _process_file(self, file_path: Path) -> List[Dict[str, Union[str, List[str]]]]:
        """Parse one XML file and return a record per matching element.

        Raises:
            etree.XMLSyntaxError: When the XML is invalid and the
                ``logging.skip_invalid_xml`` option is false.
            Exception: Any other error is logged, counted and re-raised.
        """
        records: List[Dict[str, Union[str, List[str]]]] = []

        try:
            parser = etree.XMLParser(
                remove_blank_text=True,
                resolve_entities=False,
                recover=True,  # Try to recover from errors
            )
            tree = etree.parse(str(file_path), parser)

            if tree.getroot() is None:
                # With recover=True the parser may hand back a tree without a
                # root instead of raising; treat that like invalid XML.
                message = f"No XML root element could be parsed from {file_path}"
                if not self._logging_option("skip_invalid_xml", True):
                    raise ValueError(message)
                logger.error(message)
                self.stats["files_skipped"] += 1
                self.stats["errors"].append(message)
                return records

            # Namespaces are re-resolved for every file.
            self.namespaces = self._merge_namespaces(self._extract_namespaces(tree))

            root_xpath = self.config.get("root_xpath", "")
            if not root_xpath:
                logger.error("No root_xpath specified in configuration")
                return records

            try:
                record_elements = tree.xpath(root_xpath, namespaces=self.namespaces)
            except etree.XPathEvalError as e:
                logger.error(f"Invalid root_xpath '{root_xpath}': {e}")
                self.stats["errors"].append(f"Root XPath error in {file_path}: {e}")
                return records

            if not record_elements:
                # A deleted OAI record is expected to be empty, so it only
                # gets an info message. The check is best-effort.
                try:
                    deleted_headers = tree.xpath(
                        "//oai:header[@status='deleted']",
                        namespaces={"oai": "http://www.openarchives.org/OAI/2.0/"},
                    )
                except Exception:
                    deleted_headers = []

                if deleted_headers:
                    logger.info(f"Skipping deleted OAI record in {file_path.name}")
                else:
                    logger.warning(
                        f"No records found with root_xpath '{root_xpath}' in {file_path}"
                    )
                return records

            logger.info(f"Found {len(record_elements)} record(s) in {file_path.name}")

            field_configs = self.config.get("fields", [])
            total = len(record_elements)
            for idx, element in enumerate(record_elements, 1):
                if self.debug:
                    logger.debug(
                        f"Processing record {idx}/{total} in {file_path.name}"
                    )

                if not self._check_record_filters(element, str(file_path)):
                    self.stats["records_filtered"] += 1
                    if self.debug:
                        logger.debug(f"Record {idx} filtered out by record_filters")
                    continue

                records.append(
                    self._process_record(element, field_configs, str(file_path))
                )
                self.stats["records_extracted"] += 1

            self.stats["files_processed"] += 1

        except etree.XMLSyntaxError as e:
            logger.error(f"Invalid XML in {file_path}: {e}")
            self.stats["files_skipped"] += 1
            self.stats["errors"].append(f"XML syntax error in {file_path}: {e}")

            if not self._logging_option("skip_invalid_xml", True):
                raise

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            self.stats["files_skipped"] += 1
            self.stats["errors"].append(f"Error in {file_path}: {e}")
            raise

        return records

    def extract(
        self, input_dir: Path, output_file: Path, output_format: str = "csv"
    ) -> None:
        """Process every ``*.xml`` file in ``input_dir`` and write the result.

        Files are handled in sorted name order so the output rows are in a
        reproducible order. Nothing is written when no records were
        extracted. A missing-fields summary and the run statistics are
        printed at the end.
        """
        xml_files = sorted(input_dir.glob("*.xml"))

        if not xml_files:
            console.print(f"[yellow]No XML files found in {input_dir}[/yellow]")
            return

        console.print(f"[cyan]Found {len(xml_files)} XML file(s) to process[/cyan]")

        all_records: List[Dict[str, Union[str, List[str]]]] = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            console=console,
        ) as progress:
            task = progress.add_task(
                "[cyan]Processing XML files...", total=len(xml_files)
            )

            for xml_file in xml_files:
                progress.update(
                    task, description=f"[cyan]Processing {xml_file.name}..."
                )
                all_records.extend(self._process_file(xml_file))
                progress.advance(task)

        if all_records:
            self._write_output(all_records, output_file, output_format)
            console.print(
                f"\n[green]✓ Successfully wrote extracted data from {len(all_records)} record(s) to {output_file.absolute()}[/green]"
            )
        else:
            console.print(
                "\n[yellow]⚠ No records extracted, output not created[/yellow]"
            )

        verbosity = self._logging_option("missing_xpath_verbosity", "summary")
        missing = self.stats["missing_fields_by_name"]
        if verbosity in ("summary", "detailed") and missing:
            console.print("\n[yellow]Missing Fields Summary:[/yellow]")
            logger.info("Missing Fields Summary:")
            for field_name, count in sorted(missing.items()):
                console.print(f" [yellow]• {field_name}: {count} record(s)[/yellow]")
                logger.info(f" • {field_name}: {count} record(s)")

        self._print_statistics()

    @staticmethod
    def _flatten_list_columns(df: "pd.DataFrame") -> "pd.DataFrame":
        """Return a copy of ``df`` with list cells joined by ``" | "``.

        Used for output formats that cannot store lists in a cell.
        """
        flat = df.copy()
        for col in flat.columns:
            if flat[col].apply(lambda cell: isinstance(cell, list)).any():
                flat[col] = flat[col].apply(
                    lambda cell: " | ".join(cell) if isinstance(cell, list) else cell
                )
        return flat

    def _write_output(
        self,
        records: List[Dict[str, Union[str, List[str]]]],
        output_file: Path,
        output_format: str,
    ) -> None:
        """Write extracted records to file using pandas.

        Supported formats are ``csv``, ``parquet``, ``excel`` and ``json``.
        List values are kept as lists for Parquet and JSON and joined into a
        string for CSV and Excel. The output directory is created if needed.

        Raises:
            ValueError: For any other ``output_format``.
        """
        if not records:
            return

        df = pd.DataFrame(records)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        if output_format == "csv":
            self._flatten_list_columns(df).to_csv(
                output_file, index=False, encoding="utf-8"
            )
            logger.info(f"Wrote {len(records)} records to CSV: {output_file}")

        elif output_format == "parquet":
            df.to_parquet(output_file, index=False, engine="pyarrow")
            logger.info(f"Wrote {len(records)} records to Parquet: {output_file}")

        elif output_format == "excel":
            self._flatten_list_columns(df).to_excel(
                output_file, index=False, engine="openpyxl"
            )
            logger.info(f"Wrote {len(records)} records to Excel: {output_file}")

        elif output_format == "json":
            df.to_json(output_file, orient="records", indent=2, force_ascii=False)
            logger.info(f"Wrote {len(records)} records to JSON: {output_file}")

        else:
            raise ValueError(f"Unsupported output format: {output_format}")

    def _print_statistics(self) -> None:
        """Print the run statistics as a table and mirror them to the log.

        In debug mode the first ten collected error messages are listed too.
        """
        stats = self.stats

        # One row list feeds both the table and the log lines.
        rows = [
            ("Files processed", stats["files_processed"]),
            ("Files skipped", stats["files_skipped"]),
            ("Records extracted", stats["records_extracted"]),
        ]
        if stats["records_filtered"] > 0:
            rows.append(("Records filtered out", stats["records_filtered"]))
        rows.append(("Fields with missing data", stats["fields_missing"]))
        rows.append(("Errors", len(stats["errors"])))

        table = Table(
            title="Extraction Statistics", show_header=True, header_style="bold magenta"
        )
        table.add_column("Metric", style="cyan", no_wrap=True)
        table.add_column("Value", style="green", justify="right")
        for label, amount in rows:
            table.add_row(label, str(amount))

        console.print()
        console.print(table)

        logger.info("Extraction Statistics:")
        for label, amount in rows:
            logger.info(f" {label}: {amount}")

        if stats["errors"] and self.debug:
            console.print("\n[red]Errors encountered:[/red]")
            for error in stats["errors"][:10]:  # Show first 10 errors
                console.print(f" • {error}")
            if len(stats["errors"]) > 10:
                console.print(f" ... and {len(stats['errors']) - 10} more")
589
+
590
+
591
def load_config(config_file: Path) -> Dict[str, Any]:
    """Load and validate configuration from YAML file.

    Args:
        config_file: Path of the YAML configuration file.

    Returns:
        The parsed configuration mapping.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        ValueError: If the file does not contain a mapping, if ``root_xpath``
            or ``fields`` is missing, or if ``fields`` is empty.
    """
    if not config_file.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_file}")

    with open(config_file, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # An empty file loads as None and a top-level list or scalar is not a
    # mapping either; the key checks below would then fail with an opaque
    # TypeError instead of a configuration error.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file must contain a YAML mapping: {config_file}"
        )

    for field in ("root_xpath", "fields"):
        if field not in config:
            raise ValueError(f"Required field '{field}' missing in configuration")

    if not config["fields"]:
        raise ValueError("No fields configured for extraction")

    return config
609
+
610
+
611
# Root command group; the `run` and `build` subcommands attach to it.
# The docstring doubles as the help text click shows for the group.
@click.group()
def cli():
    """XML Field Extractor — extract XML data or build configs interactively."""
615
+
616
+
617
@cli.command("run")
@click.argument(
    "config",
    type=click.Path(exists=True, path_type=Path),
)
@click.option(
    "--input",
    "-i",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    help="Input directory containing XML files (overrides config)",
)
@click.option(
    "--output",
    "-o",
    "output_file",
    type=click.Path(path_type=Path),
    help="Output file path (overrides config)",
)
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["csv", "parquet", "excel", "json"], case_sensitive=False),
    default="csv",
    help="Output format (default: csv)",
)
@click.option(
    "--debug",
    is_flag=True,
    default=False,
    help="Enable debug mode with verbose logging",
)
def run_cmd(
    config: Path,
    input_dir: Optional[Path],
    output_file: Optional[Path],
    output_format: str,
    debug: bool,
):
    """Extract fields from XML files using CONFIG yaml file.

    Supports Dublin Core, METS, LIDO, PREMIS, and other XML formats.
    Output formats: CSV, Parquet (native list support), Excel, JSON.
    """
    # NOTE: the docstring above is the help text click displays, so
    # implementation notes live in comments instead.
    #
    # Flow: load the config, apply command-line overrides, derive the output
    # and log paths, configure logging, then run the extractor. Any failure
    # is logged with its traceback and turned into click.Abort.
    console.print()
    console.print("[bold cyan]═══════════════════════════════════════[/bold cyan]")
    console.print("[bold cyan] XML Field Extractor[/bold cyan]")
    console.print("[bold cyan]═══════════════════════════════════════[/bold cyan]")
    console.print()

    try:
        console.print(f"[cyan]Loading configuration from {config}...[/cyan]")
        cfg = load_config(config)

        # Command-line options take precedence over the config file.
        if input_dir:
            cfg["input_directory"] = str(input_dir)
        if output_file:
            cfg["output_file"] = str(output_file)

        # The option is case-insensitive; normalise so the lookups and
        # comparisons below always see the lower-case name.
        output_format = output_format.lower()

        # `or` also covers keys that exist but are empty (None) in the YAML,
        # which would otherwise make Path() raise TypeError.
        output_base = Path(cfg.get("output_file") or "output")

        format_extensions = {
            "csv": ".csv",
            "parquet": ".parquet",
            "excel": ".xlsx",
            "json": ".json",
        }

        # A path that already carries a known output extension is kept as
        # given; otherwise the extension for the chosen format is applied.
        extension = format_extensions.get(output_format, ".csv")
        if output_base.suffix not in format_extensions.values():
            output_path = output_base.with_suffix(extension)
        else:
            output_path = output_base

        # The log file sits next to the output file and shares its stem.
        log_path = output_path.parent / f"{output_path.stem}.log"

        # Replace the existing log handlers with a console sink and a file sink.
        logger.remove()
        logger.add(
            lambda msg: console.print(msg, end=""),
            level="DEBUG" if debug else "WARNING",
            format="<level>{message}</level>",
        )
        logger.add(
            log_path,
            level="DEBUG",
            format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
            rotation="10 MB",
        )

        logger.info("=" * 50)
        logger.info("XML Field Extractor - Starting extraction")
        logger.info(f"Configuration: {config}")
        logger.info(f"Debug mode: {debug}")

        input_path = Path(cfg.get("input_directory") or "./example_data")

        if not input_path.exists():
            console.print(f"[red]✗ Input directory not found: {input_path}[/red]")
            return

        console.print(f"[cyan]Input directory: {input_path}[/cyan]")
        console.print(f"[cyan]Output file: {output_path}[/cyan]")
        console.print(f"[cyan]Output format: {output_format.upper()}[/cyan]")
        console.print(f"[cyan]Log file: {log_path}[/cyan]")
        console.print()

        extractor = XMLExtractor(cfg, debug=debug)
        extractor.extract(input_path, output_path, output_format)

        logger.info("Extraction completed successfully")
        console.print("\n[green]✓ Extraction completed![/green]")

    except Exception as e:
        logger.exception("Fatal error during extraction")
        console.print(f"\n[red]✗ Error: {e}[/red]")
        # Chain the cause so the original error stays attached to the abort.
        raise click.Abort() from e
734
+
735
+
736
@cli.command("build")
@click.argument(
    "xml_file",
    type=click.Path(exists=True, path_type=Path),
)
def build_cmd(xml_file: Path) -> None:
    """Interactively build a config YAML from an XML sample file."""
    # NOTE: the docstring above is the help text click displays.
    #
    # Imported here rather than at module level, so the builder module is
    # only loaded when this subcommand runs.
    from xml_config_builder import ConfigBuilderApp

    config_path = ConfigBuilderApp(xml_file).run()

    # The builder returned no config path: there is nothing to run.
    if not config_path:
        return

    console.print(f"\n[cyan]Running extraction with {config_path} …[/cyan]\n")
    cfg = load_config(config_path)

    # Output is always CSV here; the log file sits next to it with the same stem.
    csv_path = Path(cfg.get("output_file", config_path.stem)).with_suffix(".csv")
    log_file = csv_path.parent / f"{csv_path.stem}.log"

    # Replace the existing log handlers with a console sink and a file sink.
    logger.remove()
    logger.add(
        lambda msg: console.print(msg, end=""),
        level="WARNING",
        format="<level>{message}</level>",
    )
    logger.add(
        log_file,
        level="DEBUG",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
        rotation="10 MB",
    )

    # Without a configured input directory, use the sample file's folder.
    source_dir = Path(cfg.get("input_directory", xml_file.parent))
    XMLExtractor(cfg).extract(source_dir, csv_path, "csv")
    console.print("\n[green]✓ Extraction completed![/green]")
772
+
773
+
774
# `main` is an alias of the click group, kept so the module still exposes a
# conventional `main` entry point.
main = cli

if __name__ == "__main__":
    main()