xml-data-extractor 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xml_config_builder.py ADDED
@@ -0,0 +1,617 @@
1
+ #!/usr/bin/env python3
2
+ """Interactive XML configuration builder for xml-extractor."""
3
+
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional
7
+
8
+ import click
9
+ import yaml
10
+ from lxml import etree
11
+ from textual import on
12
+ from textual.app import App, ComposeResult
13
+ from textual.binding import Binding
14
+ from textual.containers import Horizontal, Vertical
15
+ from textual.screen import ModalScreen
16
+ from textual.widgets import (
17
+ Button,
18
+ DataTable,
19
+ Footer,
20
+ Header,
21
+ Input,
22
+ Label,
23
+ Static,
24
+ Tree,
25
+ )
26
+ from textual.widgets.tree import TreeNode
27
+
28
+
29
+ # ── Namespace helpers ──────────────────────────────────────────────────────────
30
+
31
def collect_namespaces(root: etree._Element) -> Dict[str, str]:
    """Collect a prefix → namespace-URI mapping covering the whole document.

    Every distinct, non-empty namespace URI found in the ``nsmap`` of any
    element reachable through ``root.iter()`` is registered exactly once,
    under the first prefix it was seen with. A namespace declared without a
    prefix is registered as ``"default"`` so that it can still be addressed
    by prefix.

    If the chosen name is already bound to a *different* URI (a prefix that
    is re-declared deeper in the document, or a second default namespace),
    a numeric suffix is appended (``dc2``, ``default2``, ...) instead of
    overwriting or dropping a binding, so every URI keeps a usable prefix.

    Args:
        root: Element whose subtree (including itself) is scanned.

    Returns:
        Dict mapping prefix to namespace URI, in first-seen order.
    """
    ns: Dict[str, str] = {}
    seen_uris = set()  # O(1) "already registered?" check
    for el in root.iter():
        for prefix, uri in (el.nsmap or {}).items():
            if not uri or uri in seen_uris:
                continue
            seen_uris.add(uri)
            base = prefix or "default"
            name = base
            suffix = 1
            # Never clobber an existing binding: pick the next free alias.
            while name in ns:
                suffix += 1
                name = f"{base}{suffix}"
            ns[name] = uri
    return ns
42
+
43
+
44
def tag_to_qualified(tag: str, ns: Dict[str, str]) -> str:
    """Turn a Clark-notation name (``{uri}local``) into ``prefix:local``.

    Values that are not strings, or that do not start with ``{``, are
    returned untouched. The prefix is the first key of ``ns`` whose value
    equals the URI; when no such key exists (or the key is empty) only the
    local name is returned.
    """
    if not (isinstance(tag, str) and tag.startswith("{")):
        return tag

    closing = tag.index("}")
    uri = tag[1:closing]
    local = tag[closing + 1:]

    for prefix, bound_uri in ns.items():
        if bound_uri == uri:
            # Only the first matching binding is considered.
            return f"{prefix}:{local}" if prefix else local
    return local
52
+
53
+
54
def make_element_xpath(element: etree._Element, ns: Dict[str, str]) -> str:
    """Build the XPath selecting the text nodes of any descendant with this element's tag."""
    qualified = tag_to_qualified(element.tag, ns)
    return ".//" + qualified + "/text()"
56
+
57
+
58
def make_attr_xpath(element: etree._Element, ns: Dict[str, str], attr: str) -> str:
    """Build the XPath selecting attribute ``attr`` on any descendant with this element's tag.

    Args:
        element: Element that owns the attribute; only its tag is used.
        ns: Prefix → namespace-URI mapping used to qualify names.
        attr: Attribute name, either plain (``type``) or in Clark notation
            (``{uri}lang``) as used for namespaced attributes.

    Returns:
        An expression of the form ``.//prefix:tag/@prefix:attr``.
    """
    # Namespaced attribute names come in Clark notation, which is not valid
    # XPath after "@"; qualify them the same way as element tags.
    # The "xml" prefix is reserved by the Namespaces in XML recommendation and
    # permanently bound to this URI, so documents use it without declaring it
    # and it may be absent from `ns`.
    # NOTE(review): assumes the consumer's XPath engine resolves the built-in
    # "xml" prefix (libxml2 does) - confirm for xml-extractor.
    attr_ns = {**ns, "xml": "http://www.w3.org/XML/1998/namespace"}
    qualified_attr = tag_to_qualified(attr, attr_ns)
    return f".//{tag_to_qualified(element.tag, ns)}/@{qualified_attr}"
60
+
61
+
62
def make_root_xpath(element: etree._Element, ns: Dict[str, str]) -> str:
    """Build the XPath matching every descendant that shares this element's tag."""
    qualified = tag_to_qualified(element.tag, ns)
    return ".//" + qualified
64
+
65
+
66
def display_tag(element: etree._Element, ns: Dict[str, str]) -> str:
    """Return the element's tag in ``prefix:local`` form for use as a tree label."""
    label = tag_to_qualified(element.tag, ns)
    return label
68
+
69
+
70
def suggested_column(element: etree._Element, ns: Dict[str, str], attr: Optional[str] = None) -> str:
    """Propose a column name for an element or one of its attributes.

    Args:
        element: Element being mapped; its tag is only used when ``attr``
            is not given.
        ns: Prefix → namespace-URI mapping used to qualify the tag.
        attr: Optional attribute name. A leading ``@`` and a Clark-notation
            namespace (``{uri}``) are both stripped.

    Returns:
        The local attribute name (if ``attr`` is given) or the local tag
        name, passed through ``str.capitalize``.
    """
    if attr:
        # Namespaced attributes arrive as "{uri}name"; keep only the local
        # part so the URI does not leak into the suggested column name.
        local_attr = attr.lstrip("@").rpartition("}")[2]
        return local_attr.capitalize()
    tag = tag_to_qualified(element.tag, ns).split(":")[-1]
    return tag.capitalize()
75
+
76
+
77
+ # ── Custom Tree widget ────────────────────────────────────────────────────────
78
+
79
class XMLTree(Tree):
    """Tree widget whose left/right arrow keys fold and unfold nodes.

    Horizontal overflow is hidden via CSS, and the two arrow keys are
    intercepted in ``on_key`` so they act on the node under the cursor.
    """

    DEFAULT_CSS = """
    XMLTree {
        overflow-x: hidden;
    }
    """

    def on_key(self, event) -> None:
        """Handle the left/right arrow keys for the node under the cursor.

        Right expands a collapsed node, or moves to its first child when it
        is already expanded. Left collapses an expanded node, or moves to
        the parent when that parent carries data. Any other key, or a
        missing cursor node, is left untouched.
        """
        current = self.cursor_node
        key = event.key
        if current is None or key not in ("right", "left"):
            return

        # The key is ours: keep it from reaching any other handler.
        event.stop()
        event.prevent_default()

        if key == "right":
            if not current.is_expanded:
                current.expand()
            elif current.children:
                self.move_cursor(current.children[0])
            return

        if current.is_expanded:
            current.collapse()
            return
        if current.parent and current.parent.data is not None:
            self.move_cursor(current.parent)
104
+
105
+
106
+ # ── Node data types ────────────────────────────────────────────────────────────
107
+
108
class ElemData:
    """Payload attached to a tree node that represents an XML element."""

    def __init__(self, element: etree._Element):
        # Discriminator checked by consumers to tell elements from attributes.
        self.is_attr = False
        self.element = element
112
+
113
+
114
class AttrData:
    """Payload attached to a tree node that represents one attribute of an element."""

    def __init__(self, element: etree._Element, attr: str, value: str):
        # Discriminator checked by consumers to tell attributes from elements.
        self.is_attr = True
        # The owning element, then the attribute's name and value.
        self.element = element
        self.attr, self.value = attr, value
120
+
121
+
122
+ # ── Modal screens ──────────────────────────────────────────────────────────────
123
+
124
class WelcomeDialog(ModalScreen):
    """Modal shown at startup that explains how to pick the record element.

    Enter, Escape and the "Got it" button all dismiss it with ``None``.
    """

    DEFAULT_CSS = """
    WelcomeDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $primary;
        padding: 1 3;
        width: 64;
        height: auto;
    }
    #title {
        text-style: bold;
        color: $primary;
        margin-bottom: 1;
    }
    #body {
        margin-bottom: 1;
        color: $text;
    }
    #hint {
        color: $text-muted;
        margin-bottom: 1;
    }
    #btn-row {
        height: auto;
        align: right middle;
        margin-top: 1;
    }
    """

    BINDINGS = [
        Binding("enter", "ok", "OK"),
        Binding("escape", "ok", "OK"),
    ]

    def compose(self) -> ComposeResult:
        """Yield the title, the instructions and the confirmation button."""
        instructions = (
            "Navigate the XML tree with the arrow keys and press [bold]Enter[/bold] "
            "on the element that represents one record (one output row).\n\n"
            "Example: in an OAI-PMH feed, that would be [bold]oai_dc:dc[/bold]."
        )
        with Vertical(id="dialog"):
            yield Label("Step 1 — Select the root element", id="title")
            yield Label(instructions, id="body")
            with Horizontal(id="btn-row"):
                yield Button("Got it", variant="primary", id="btn-ok")

    @on(Button.Pressed, "#btn-ok")
    def action_ok(self) -> None:
        """Close the dialog; serves both the key bindings and the button."""
        self.dismiss(None)
175
+
176
+
177
class ColumnDialog(ModalScreen):
    """Modal asking for the column name and separator of a selected XPath.

    Dismisses with ``{"column": ..., "separator": ...}`` on confirmation, or
    with ``None`` when cancelled or when the column name is blank.
    """

    DEFAULT_CSS = """
    ColumnDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $primary;
        padding: 1 3;
        width: 64;
        height: auto;
    }
    #xpath-label {
        color: $text-muted;
        margin-bottom: 1;
    }
    #xpath-value {
        color: $success;
        margin-bottom: 1;
        overflow-x: auto;
    }
    #col-input {
        margin-bottom: 1;
    }
    #btn-row {
        height: auto;
        align: right middle;
    }
    Button { margin-left: 1; }
    """

    BINDINGS = [Binding("escape", "cancel", "Cancel")]

    def __init__(self, xpath: str, suggested: str = ""):
        super().__init__()
        self.suggested = suggested  # pre-filled column name
        self.xpath = xpath          # expression shown to the user

    def compose(self) -> ComposeResult:
        """Yield the XPath display, both inputs and the Add/Cancel buttons."""
        with Vertical(id="dialog"):
            yield Label("XPath:", id="xpath-label")
            yield Static(self.xpath, id="xpath-value")
            yield Input(
                value=self.suggested,
                placeholder="Column name",
                id="col-input",
            )
            yield Label("Separator (for multiple values):", id="sep-label")
            yield Input(
                value=" | ",
                placeholder="e.g. | or ,",
                id="sep-input",
            )
            with Horizontal(id="btn-row"):
                yield Button("Add", variant="primary", id="btn-add")
                yield Button("Cancel", id="btn-cancel")

    def on_mount(self) -> None:
        """Focus the column input and put the cursor after the suggestion."""
        column_input = self.query_one("#col-input", Input)
        column_input.focus()
        column_input.action_end()

    @on(Input.Submitted)
    def submitted(self) -> None:
        """Confirm when an input is submitted."""
        self._confirm()

    @on(Button.Pressed, "#btn-add")
    def pressed_add(self) -> None:
        """Confirm when the Add button is pressed."""
        self._confirm()

    @on(Button.Pressed, "#btn-cancel")
    def action_cancel(self) -> None:
        """Close without a result (Cancel button or Escape binding)."""
        self.dismiss(None)

    def _confirm(self) -> None:
        """Dismiss with the entered values, or with ``None`` if the name is blank."""
        name = self.query_one("#col-input", Input).value.strip()
        if name:
            # The separator is passed through unstripped: its whitespace matters.
            sep = self.query_one("#sep-input", Input).value
            self.dismiss({"column": name, "separator": sep})
        else:
            self.dismiss(None)
252
+
253
+
254
class SaveDialog(ModalScreen):
    """Modal asking where the YAML config should be written.

    Dismisses with a ``Path`` built from the entered text, or with ``None``
    when cancelled or when the text is blank.
    """

    DEFAULT_CSS = """
    SaveDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $primary;
        padding: 1 3;
        width: 50;
        height: auto;
    }
    #btn-row {
        height: auto;
        align: right middle;
        margin-top: 1;
    }
    Button { margin-left: 1; }
    """

    BINDINGS = [Binding("escape", "cancel", "Cancel")]

    def __init__(self, default: str = "config.yaml"):
        super().__init__()
        self.default = default  # initial content of the path input

    def compose(self) -> ComposeResult:
        """Yield the prompt, the path input and the Save/Cancel buttons."""
        with Vertical(id="dialog"):
            yield Label("Save config as:")
            yield Input(value=self.default, id="path-input")
            with Horizontal(id="btn-row"):
                yield Button("Save", variant="primary", id="btn-save")
                yield Button("Cancel", id="btn-cancel")

    def on_mount(self) -> None:
        """Give keyboard focus to the path input."""
        path_input = self.query_one("#path-input", Input)
        path_input.focus()

    @on(Input.Submitted)
    def submitted(self) -> None:
        """Confirm when the input is submitted."""
        self._confirm()

    @on(Button.Pressed, "#btn-save")
    def pressed_save(self) -> None:
        """Confirm when the Save button is pressed."""
        self._confirm()

    @on(Button.Pressed, "#btn-cancel")
    def action_cancel(self) -> None:
        """Close without a result (Cancel button or Escape binding)."""
        self.dismiss(None)

    def _confirm(self) -> None:
        """Dismiss with the entered path, or with ``None`` if it is blank."""
        entered = self.query_one("#path-input", Input).value.strip()
        if entered:
            self.dismiss(Path(entered))
        else:
            self.dismiss(None)
308
+
309
+
310
class RunDialog(ModalScreen):
    """Modal shown after saving that offers to run the extraction right away.

    Dismisses with ``True`` for "Run" and ``False`` for "Exit".
    """

    DEFAULT_CSS = """
    RunDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $success;
        padding: 1 3;
        width: 50;
        height: auto;
    }
    #btn-row {
        height: auto;
        align: right middle;
        margin-top: 1;
    }
    Button { margin-left: 1; }
    """

    def __init__(self, config_path: Path, input_dir: Path):
        super().__init__()
        self.input_dir = input_dir      # directory named in the question
        self.config_path = config_path  # file that was just written

    def compose(self) -> ComposeResult:
        """Yield the confirmation text and the Run/Exit buttons."""
        saved_text = f"Saved to [bold]{self.config_path}[/bold]"
        question = f"Run extraction now on all XML files in [bold]{self.input_dir}[/bold]?"
        with Vertical(id="dialog"):
            yield Label(saved_text)
            yield Label(question)
            with Horizontal(id="btn-row"):
                yield Button("Run", variant="success", id="btn-run")
                yield Button("Exit", id="btn-exit")

    @on(Button.Pressed, "#btn-run")
    def do_run(self) -> None:
        """Report that the user wants to run the extraction."""
        self.dismiss(True)

    @on(Button.Pressed, "#btn-exit")
    def do_exit(self) -> None:
        """Report that the user declined to run the extraction."""
        self.dismiss(False)
352
+
353
+
354
+ # ── Main App ───────────────────────────────────────────────────────────────────
355
+
356
PHASE_ROOT = "root"      # phase 1: the user is choosing the record element
PHASE_FIELDS = "fields"  # phase 2: the user is mapping fields below that element


class ConfigBuilderApp(App):
    """Interactive XML → YAML config builder.

    The app works in two phases. In ``PHASE_ROOT`` the whole document is
    shown and selecting an element stores it as ``root_xpath``. The tree is
    then rebuilt from that element and the app switches to ``PHASE_FIELDS``,
    where each selected element or attribute is mapped to a named column.
    Saving writes the collected settings as YAML and offers to run the
    extraction; the app then exits with the config path (run) or ``None``.
    """

    TITLE = "xml-config-builder"
    SUB_TITLE = "Build xml-extractor configs interactively"

    # Elements nested this many levels below the displayed root (or deeper)
    # do not get their children added to the tree.
    MAX_TREE_DEPTH = 12

    DEFAULT_CSS = """
    #main {
        layout: horizontal;
        height: 1fr;
        overflow: hidden hidden;
    }
    #tree-pane {
        width: 1fr;
        border: solid $primary;
        padding: 0 1;
        overflow: hidden hidden;
    }
    #fields-pane {
        width: 1fr;
        border: solid $accent;
        padding: 0 1;
        overflow: hidden hidden;
    }
    .pane-title {
        text-style: bold;
        margin-bottom: 1;
    }
    #root-display {
        color: $success;
        margin-bottom: 1;
        overflow-x: auto;
    }
    #status {
        height: 3;
        background: $panel;
        padding: 0 2;
        content-align: left middle;
        color: $text-muted;
    }
    #fields-table {
        height: 1fr;
    }
    """

    BINDINGS = [
        Binding("s", "save", "Save config"),
        Binding("d", "delete_field", "Delete field"),
        Binding("r", "reset", "Reset"),
        Binding("q", "quit", "Quit"),
    ]

    def __init__(self, xml_file: Path):
        """Store the sample file and initialise an empty configuration.

        Args:
            xml_file: XML sample whose structure is browsed.
        """
        super().__init__()
        self.xml_file = xml_file
        self.ns: Dict[str, str] = {}                       # prefix → namespace URI
        self.phase = PHASE_ROOT
        self.root_xpath: Optional[str] = None              # XPath of the record element
        self.root_element: Optional[etree._Element] = None
        self.fields: List[Dict[str, str]] = []             # mapped columns, in order
        self.run_config: Optional[Path] = None             # set when the user chooses "Run"

    def compose(self) -> ComposeResult:
        """Lay out the tree pane, the mapped-fields pane and the status bar."""
        yield Header()
        with Horizontal(id="main"):
            with Vertical(id="tree-pane"):
                yield Static("XML Tree", classes="pane-title")
                yield XMLTree("document", id="xml-tree")
            with Vertical(id="fields-pane"):
                yield Static("Mapped Fields", classes="pane-title")
                yield Static("root_xpath: (not selected)", id="root-display")
                yield DataTable(id="fields-table", cursor_type="row")
        yield Static(id="status")
        yield Footer()

    def on_mount(self) -> None:
        """Prepare the table, load the document and show the welcome dialog."""
        self._setup_table()
        self._load_xml()
        self._update_status()
        self.push_screen(WelcomeDialog())

    def _load_xml(self) -> None:
        """Parse ``self.xml_file`` and fill the tree widget from its root.

        If parsing yields no root element, the tree is emptied and an error
        notification is shown instead of failing on the missing root.
        """
        parser = etree.XMLParser(remove_blank_text=True, recover=True)
        tree = etree.parse(str(self.xml_file), parser)
        root = tree.getroot()
        if root is None:
            # A recovering parser can hand back a tree without a root when
            # nothing usable was found in the input.
            self.ns = {}
            widget = self.query_one("#xml-tree", XMLTree)
            widget.clear()
            widget.root.data = None  # make the empty root unselectable
            self.notify(f"No XML elements found in {self.xml_file}", severity="error")
            return
        self.ns = collect_namespaces(root)
        self._populate_tree(root)

    def _populate_tree(
        self,
        root: Optional[etree._Element],
        subtree_root: Optional[etree._Element] = None,
    ) -> None:
        """Rebuild the tree widget from ``subtree_root`` if given, else from ``root``.

        Args:
            root: Document root; may be ``None`` when ``subtree_root`` is given.
            subtree_root: Record element to display; its label is marked
                as the record root.
        """
        target = subtree_root if subtree_root is not None else root
        widget = self.query_one("#xml-tree", XMLTree)
        widget.clear()
        label = display_tag(target, self.ns)
        if subtree_root is not None:
            label += " [dim](record root)[/dim]"
        widget.root.set_label(label)
        widget.root.data = ElemData(target)
        self._add_children(widget.root, target, depth=0)
        widget.root.expand()

    def _add_children(self, node: TreeNode, el: etree._Element, depth: int) -> None:
        """Recursively add the attributes and child elements of ``el`` under ``node``.

        Args:
            node: Tree node that represents ``el``.
            el: Element whose attributes and children are added.
            depth: Nesting level of ``el`` below the displayed root; nothing
                is added once it reaches ``MAX_TREE_DEPTH``.
        """
        if depth >= self.MAX_TREE_DEPTH:
            return
        for attr_name, attr_val in el.attrib.items():
            preview = attr_val[:50]
            child = node.add_leaf(f"[dim]@{attr_name}[/dim] = {preview}")
            child.data = AttrData(el, attr_name, attr_val)
        for child_el in el:
            # Skip children whose tag is not a string (not regular elements).
            if not isinstance(child_el.tag, str):
                continue
            tag = display_tag(child_el, self.ns)
            text = (child_el.text or "").strip()
            preview = f"  [dim]{text[:40]}[/dim]" if text else ""
            # Attributes are shown as child nodes too, so they make a node expandable.
            has_children = bool(len(child_el) or child_el.attrib)
            if has_children:
                child_node = node.add(f"{tag}{preview}")
            else:
                child_node = node.add_leaf(f"{tag}{preview}")
            child_node.data = ElemData(child_el)
            self._add_children(child_node, child_el, depth + 1)

    def _setup_table(self) -> None:
        """Create the two columns of the mapped-fields table."""
        table = self.query_one("#fields-table", DataTable)
        table.add_column("Column", width=20)
        table.add_column("XPath")

    def _refresh_table(self) -> None:
        """Re-render the mapped-fields table from ``self.fields``."""
        table = self.query_one("#fields-table", DataTable)
        table.clear()
        for f in self.fields:
            table.add_row(f["column"], f["xpath"])

    def _update_status(self) -> None:
        """Show the instructions that belong to the current phase."""
        if self.phase == PHASE_ROOT:
            msg = "Phase 1 — Navigate to the RECORD element (one row = one record), press Enter to set as root_xpath"
        else:
            msg = "Phase 2 — Navigate to a field, press Enter to map it | [S] save | [D] delete selected | [R] reset"
        self.query_one("#status", Static).update(msg)

    def _update_root_display(self) -> None:
        """Show the currently selected ``root_xpath`` above the fields table."""
        self.query_one("#root-display", Static).update(
            f"root_xpath: [bold]{self.root_xpath}[/bold]"
        )

    @on(Tree.NodeSelected)
    def on_node_selected(self, event: Tree.NodeSelected) -> None:
        """React to a selected tree node according to the current phase.

        In phase 1 an element becomes the record root and the tree is rebuilt
        from it (attributes are rejected with a warning). In phase 2 a
        ``ColumnDialog`` is opened for the node and, if confirmed, a field is
        appended. Nodes without data are ignored.
        """
        data = event.node.data
        if data is None:
            return

        if self.phase == PHASE_ROOT:
            if data.is_attr:
                self.notify("Select an element, not an attribute, as root", severity="warning")
                return
            self.root_element = data.element
            self.root_xpath = make_root_xpath(data.element, self.ns)
            self._update_root_display()
            self.phase = PHASE_FIELDS
            self._update_status()
            self._populate_tree(None, subtree_root=data.element)

        elif self.phase == PHASE_FIELDS:
            if data.is_attr:
                xpath = make_attr_xpath(data.element, self.ns, data.attr)
                sug = suggested_column(data.element, self.ns, data.attr)
            else:
                xpath = make_element_xpath(data.element, self.ns)
                sug = suggested_column(data.element, self.ns)

            def add_field(result: Optional[dict]) -> None:
                if result:
                    field = {"column": result["column"], "xpath": xpath}
                    # The separator is only stored when it differs from the
                    # value the dialog pre-fills.
                    if result["separator"] != " | ":
                        field["separator"] = result["separator"]
                    self.fields.append(field)
                    self._refresh_table()

            self.push_screen(ColumnDialog(xpath, sug), add_field)

    def action_delete_field(self) -> None:
        """Remove the field under the table cursor (phase 2 only)."""
        if self.phase != PHASE_FIELDS or not self.fields:
            return
        table = self.query_one("#fields-table", DataTable)
        idx = table.cursor_row
        if 0 <= idx < len(self.fields):
            self.fields.pop(idx)
            self._refresh_table()

    def action_reset(self) -> None:
        """Discard the root selection and all fields, then reload the document."""
        self.phase = PHASE_ROOT
        self.root_xpath = None
        self.root_element = None
        self.fields = []
        self._refresh_table()
        self.query_one("#root-display", Static).update("root_xpath: (not selected)")
        self._load_xml()
        self._update_status()

    def action_save(self) -> None:
        """Ask for a target path and write the config there.

        Refuses with a notification while no root or no field is selected.
        """
        if not self.root_xpath:
            self.notify("Select a root element first (Phase 1)", severity="error")
            return
        if not self.fields:
            self.notify("Map at least one field first", severity="warning")
            return

        default = self.xml_file.stem + "_config.yaml"

        def on_path(path: Optional[Path]) -> None:
            if not path:
                return
            self._write_yaml(path)

        self.push_screen(SaveDialog(default), on_path)

    def _write_yaml(self, path: Path) -> None:
        """Write the config to ``path`` and then offer to run the extraction.

        Args:
            path: Destination of the YAML file. Its stem, minus a trailing
                ``_config``, is stored as ``output_file``.
        """
        config = {
            "input_directory": str(self.xml_file.parent),
            "output_file": path.stem.removesuffix("_config"),
            "root_xpath": self.root_xpath,
            "namespaces": self.ns,
            "fields": self.fields,
        }
        try:
            with open(path, "w", encoding="utf-8") as f:
                yaml.dump(config, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        except Exception as e:
            # UI boundary: report any failure in the app instead of crashing it.
            self.notify(f"Save failed: {e}", severity="error")
            return

        def on_run(should_run: Optional[bool]) -> None:
            self.run_config = path if should_run else None
            self.exit(self.run_config)

        self.push_screen(RunDialog(path, self.xml_file.parent), on_run)
595
+
596
+
597
+ # ── CLI ────────────────────────────────────────────────────────────────────────
598
+
599
@click.command()
@click.argument("xml_file", type=click.Path(exists=True, path_type=Path))
def main(xml_file: Path) -> None:
    """Interactively build an xml-extractor config from an XML sample file."""
    # The app's return value is the saved config path when the user chose
    # "Run", and None otherwise.
    config_path = ConfigBuilderApp(xml_file).run()
    if not config_path:
        return

    click.echo(f"\nRunning extraction with {config_path} …\n")
    try:
        # Imported only when needed, inside the try so that an import
        # failure is reported like any other extraction error.
        from xml_extractor import cli
        cli(args=["run", str(config_path)], standalone_mode=False)
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)


if __name__ == "__main__":
    main()