xml-data-extractor 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xml_config_builder.py +617 -0
- xml_data_extractor-0.1.0.dist-info/METADATA +408 -0
- xml_data_extractor-0.1.0.dist-info/RECORD +7 -0
- xml_data_extractor-0.1.0.dist-info/WHEEL +5 -0
- xml_data_extractor-0.1.0.dist-info/entry_points.txt +2 -0
- xml_data_extractor-0.1.0.dist-info/top_level.txt +2 -0
- xml_extractor.py +778 -0
xml_config_builder.py
ADDED
|
@@ -0,0 +1,617 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Interactive XML configuration builder for xml-extractor."""
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
import yaml
|
|
10
|
+
from lxml import etree
|
|
11
|
+
from textual import on
|
|
12
|
+
from textual.app import App, ComposeResult
|
|
13
|
+
from textual.binding import Binding
|
|
14
|
+
from textual.containers import Horizontal, Vertical
|
|
15
|
+
from textual.screen import ModalScreen
|
|
16
|
+
from textual.widgets import (
|
|
17
|
+
Button,
|
|
18
|
+
DataTable,
|
|
19
|
+
Footer,
|
|
20
|
+
Header,
|
|
21
|
+
Input,
|
|
22
|
+
Label,
|
|
23
|
+
Static,
|
|
24
|
+
Tree,
|
|
25
|
+
)
|
|
26
|
+
from textual.widgets.tree import TreeNode
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ── Namespace helpers ──────────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
def collect_namespaces(root: etree._Element) -> Dict[str, str]:
    """Collect the prefix → URI namespace bindings declared in the document.

    Every element reachable from ``root`` is visited and each distinct
    namespace URI is recorded once, under the first prefix it was seen with.
    A default (prefix-less) namespace is stored under the synthetic prefix
    ``"default"`` so it can be referenced from XPath expressions.

    If the chosen prefix is already taken by a *different* URI (a prefix that
    is re-bound deeper in the tree, or a second default namespace), a numeric
    suffix is appended (``default2``, ``dc2``, ...) instead of overwriting the
    earlier binding or dropping the new one.

    Args:
        root: Root element of the parsed document.

    Returns:
        Mapping of prefix to namespace URI, in first-seen order.
    """
    ns: Dict[str, str] = {}
    seen_uris = set()  # O(1) "already recorded" test instead of scanning ns.values()
    for el in root.iter():
        for prefix, uri in (el.nsmap or {}).items():
            if not uri or uri in seen_uris:
                continue
            base = prefix or "default"
            name = base
            suffix = 2
            # Never clobber an existing binding: pick the next free name.
            while name in ns:
                name = f"{base}{suffix}"
                suffix += 1
            ns[name] = uri
            seen_uris.add(uri)
    return ns
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def tag_to_qualified(tag: str, ns: Dict[str, str]) -> str:
    """Translate a Clark-notation name ``{uri}local`` into ``prefix:local``.

    Args:
        tag: Element or attribute name. Values that are not strings, or that
            do not start with ``{``, are returned untouched.
        ns: Prefix → URI mapping used to look up the prefix.

    Returns:
        ``prefix:local`` when the first prefix bound to the URI is non-empty,
        otherwise just the local name.
    """
    if not (isinstance(tag, str) and tag.startswith("{")):
        return tag

    closing = tag.index("}")
    uri = tag[1:closing]
    local = tag[closing + 1:]

    # The first prefix bound to this URI decides the result.
    for prefix, bound_uri in ns.items():
        if bound_uri == uri:
            return f"{prefix}:{local}" if prefix else local
    return local
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def make_element_xpath(element: etree._Element, ns: Dict[str, str]) -> str:
    """Build a descendant XPath selecting the text nodes of ``element``'s tag."""
    qualified = tag_to_qualified(element.tag, ns)
    return ".//" + qualified + "/text()"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def make_attr_xpath(element: etree._Element, ns: Dict[str, str], attr: str) -> str:
    """Build a descendant XPath selecting attribute ``attr`` of ``element``'s tag.

    Attribute names may arrive in Clark notation (``{uri}local``) when the
    attribute is namespaced; those are converted to ``prefix:local`` so the
    result is a valid XPath expression. Plain attribute names pass through
    unchanged.

    Args:
        element: Element that carries the attribute.
        ns: Prefix → URI mapping used to qualify the element and attribute.
        attr: Attribute name, plain or in Clark notation.

    Returns:
        XPath of the form ``.//tag/@attr``.
    """
    # NOTE: if the attribute's namespace has no prefix in ``ns`` the local
    # name alone is used; the expression is valid but may not match.
    qualified_tag = tag_to_qualified(element.tag, ns)
    qualified_attr = tag_to_qualified(attr, ns)
    return f".//{qualified_tag}/@{qualified_attr}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def make_root_xpath(element: etree._Element, ns: Dict[str, str]) -> str:
    """Build a descendant XPath matching every element with ``element``'s tag."""
    qualified = tag_to_qualified(element.tag, ns)
    return ".//" + qualified
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def display_tag(element: etree._Element, ns: Dict[str, str]) -> str:
    """Return the ``prefix:local`` form of ``element``'s tag for display."""
    raw_tag = element.tag
    return tag_to_qualified(raw_tag, ns)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def suggested_column(element: etree._Element, ns: Dict[str, str], attr: Optional[str] = None) -> str:
    """Suggest a column name for an element or for one of its attributes.

    Args:
        element: Element being mapped (only consulted when ``attr`` is empty).
        ns: Prefix → URI mapping used to qualify the element tag.
        attr: Attribute name, optionally prefixed with ``@`` and optionally in
            Clark notation (``{uri}local``). When given, the suggestion is
            derived from the attribute instead of the element.

    Returns:
        The capitalized local name of the attribute or element.
    """
    if attr:
        # Drop a leading "@" and any "{uri}" namespace part so a namespaced
        # attribute suggests "Lang" rather than "{http://...}lang".
        local_attr = attr.lstrip("@").rpartition("}")[2]
        return local_attr.capitalize()
    local_tag = tag_to_qualified(element.tag, ns).split(":")[-1]
    return local_tag.capitalize()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Custom Tree widget ────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
class XMLTree(Tree):
    """Tree with right/left arrow keys for expand/collapse instead of scroll."""

    DEFAULT_CSS = """
    XMLTree {
        overflow-x: hidden;
    }
    """

    def on_key(self, event) -> None:
        """Handle the left/right arrow keys on the node under the cursor.

        Right expands a collapsed node, or moves to the first child of an
        already expanded one. Left collapses an expanded node, or moves to the
        parent when that parent carries data. Other keys are left untouched.
        """
        current = self.cursor_node
        if current is None or event.key not in ("right", "left"):
            return

        # Claim the key so the default horizontal scroll does not run.
        event.stop()
        event.prevent_default()

        if event.key == "right":
            if not current.is_expanded:
                current.expand()
            elif current.children:
                self.move_cursor(current.children[0])
            return

        if current.is_expanded:
            current.collapse()
        elif current.parent and current.parent.data is not None:
            self.move_cursor(current.parent)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ── Node data types ────────────────────────────────────────────────────────────
|
|
107
|
+
|
|
108
|
+
class ElemData:
    """Tree-node payload describing an XML element."""

    def __init__(self, element: etree._Element):
        # Discriminator checked by the selection handler; always False here.
        self.is_attr = False
        self.element = element
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class AttrData:
    """Tree-node payload describing one attribute of an XML element."""

    def __init__(self, element: etree._Element, attr: str, value: str):
        # Discriminator checked by the selection handler; always True here.
        self.is_attr = True
        self.element = element  # element that owns the attribute
        self.attr, self.value = attr, value
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ── Modal screens ──────────────────────────────────────────────────────────────
|
|
123
|
+
|
|
124
|
+
class WelcomeDialog(ModalScreen):
    """Modal shown at startup explaining how to pick the root element."""

    DEFAULT_CSS = """
    WelcomeDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $primary;
        padding: 1 3;
        width: 64;
        height: auto;
    }
    #title {
        text-style: bold;
        color: $primary;
        margin-bottom: 1;
    }
    #body {
        margin-bottom: 1;
        color: $text;
    }
    #hint {
        color: $text-muted;
        margin-bottom: 1;
    }
    #btn-row {
        height: auto;
        align: right middle;
        margin-top: 1;
    }
    """

    # Both Enter and Escape acknowledge the dialog.
    BINDINGS = [
        Binding("enter", "ok", "OK"),
        Binding("escape", "ok", "OK"),
    ]

    def compose(self) -> ComposeResult:
        """Lay out the title, the instructions and the acknowledge button."""
        instructions = (
            "Navigate the XML tree with the arrow keys and press [bold]Enter[/bold] "
            "on the element that represents one record (one output row).\n\n"
            "Example: in an OAI-PMH feed, that would be [bold]oai_dc:dc[/bold]."
        )
        with Vertical(id="dialog"):
            yield Label("Step 1 — Select the root element", id="title")
            yield Label(instructions, id="body")
            with Horizontal(id="btn-row"):
                yield Button("Got it", variant="primary", id="btn-ok")

    @on(Button.Pressed, "#btn-ok")
    def action_ok(self) -> None:
        """Close the dialog (button press or the ``ok`` key binding)."""
        self.dismiss(None)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class ColumnDialog(ModalScreen):
    """Modal asking for the column name and separator of a selected XPath.

    Dismisses with ``{"column": ..., "separator": ...}`` on confirmation, or
    with ``None`` when cancelled or when the column name is left empty.
    """

    DEFAULT_CSS = """
    ColumnDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $primary;
        padding: 1 3;
        width: 64;
        height: auto;
    }
    #xpath-label {
        color: $text-muted;
        margin-bottom: 1;
    }
    #xpath-value {
        color: $success;
        margin-bottom: 1;
        overflow-x: auto;
    }
    #col-input {
        margin-bottom: 1;
    }
    #btn-row {
        height: auto;
        align: right middle;
    }
    Button { margin-left: 1; }
    """

    BINDINGS = [Binding("escape", "cancel", "Cancel")]

    def __init__(self, xpath: str, suggested: str = ""):
        """Remember the XPath to display and the pre-filled column name."""
        super().__init__()
        self.suggested = suggested
        self.xpath = xpath

    def compose(self) -> ComposeResult:
        """Lay out the XPath display, the two inputs and the Add/Cancel buttons."""
        with Vertical(id="dialog"):
            yield Label("XPath:", id="xpath-label")
            yield Static(self.xpath, id="xpath-value")
            yield Input(value=self.suggested, placeholder="Column name", id="col-input")
            yield Label("Separator (for multiple values):", id="sep-label")
            yield Input(value=" | ", placeholder="e.g. | or ,", id="sep-input")
            with Horizontal(id="btn-row"):
                yield Button("Add", variant="primary", id="btn-add")
                yield Button("Cancel", id="btn-cancel")

    def on_mount(self) -> None:
        """Focus the column-name input and run its ``end`` action."""
        column_input = self.query_one("#col-input", Input)
        column_input.focus()
        column_input.action_end()

    @on(Input.Submitted)
    def submitted(self) -> None:
        """Confirm when an input is submitted."""
        self._confirm()

    @on(Button.Pressed, "#btn-add")
    def pressed_add(self) -> None:
        """Confirm when the Add button is pressed."""
        self._confirm()

    @on(Button.Pressed, "#btn-cancel")
    def action_cancel(self) -> None:
        """Dismiss without a result (Cancel button or the ``cancel`` binding)."""
        self.dismiss(None)

    def _confirm(self) -> None:
        """Dismiss with the entered values, or with ``None`` if the name is blank."""
        name = self.query_one("#col-input", Input).value.strip()
        outcome = None
        if name:
            # The separator is taken verbatim (not stripped).
            sep = self.query_one("#sep-input", Input).value
            outcome = {"column": name, "separator": sep}
        self.dismiss(outcome)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class SaveDialog(ModalScreen):
    """Modal asking where to save the config YAML.

    Dismisses with a ``Path`` on confirmation, or with ``None`` when cancelled
    or when the path field is left empty.
    """

    DEFAULT_CSS = """
    SaveDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $primary;
        padding: 1 3;
        width: 50;
        height: auto;
    }
    #btn-row {
        height: auto;
        align: right middle;
        margin-top: 1;
    }
    Button { margin-left: 1; }
    """

    BINDINGS = [Binding("escape", "cancel", "Cancel")]

    def __init__(self, default: str = "config.yaml"):
        """Remember the file name used to pre-fill the path input."""
        super().__init__()
        self.default = default

    def compose(self) -> ComposeResult:
        """Lay out the prompt, the path input and the Save/Cancel buttons."""
        with Vertical(id="dialog"):
            yield Label("Save config as:")
            yield Input(value=self.default, id="path-input")
            with Horizontal(id="btn-row"):
                yield Button("Save", variant="primary", id="btn-save")
                yield Button("Cancel", id="btn-cancel")

    def on_mount(self) -> None:
        """Give the path input keyboard focus."""
        self.query_one("#path-input", Input).focus()

    @on(Input.Submitted)
    def submitted(self) -> None:
        """Confirm when the input is submitted."""
        self._confirm()

    @on(Button.Pressed, "#btn-save")
    def pressed_save(self) -> None:
        """Confirm when the Save button is pressed."""
        self._confirm()

    @on(Button.Pressed, "#btn-cancel")
    def action_cancel(self) -> None:
        """Dismiss without a result (Cancel button or the ``cancel`` binding)."""
        self.dismiss(None)

    def _confirm(self) -> None:
        """Dismiss with the entered path, or with ``None`` if it is blank."""
        value = self.query_one("#path-input", Input).value.strip()
        if not value:
            self.dismiss(None)
            return
        # Expand a leading "~" so a typed "~/cfg.yaml" targets the user's home
        # directory instead of a literal "~" directory.
        self.dismiss(Path(value).expanduser())
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
class RunDialog(ModalScreen):
    """Modal asking whether to run the extraction right after saving.

    Dismisses with ``True`` for "Run" and ``False`` for "Exit".
    """

    DEFAULT_CSS = """
    RunDialog {
        align: center middle;
    }
    #dialog {
        background: $surface;
        border: solid $success;
        padding: 1 3;
        width: 50;
        height: auto;
    }
    #btn-row {
        height: auto;
        align: right middle;
        margin-top: 1;
    }
    Button { margin-left: 1; }
    """

    def __init__(self, config_path: Path, input_dir: Path):
        """Remember the saved config path and the directory to be processed."""
        super().__init__()
        self.input_dir = input_dir
        self.config_path = config_path

    def compose(self) -> ComposeResult:
        """Lay out the confirmation text and the Run/Exit buttons."""
        saved_msg = f"Saved to [bold]{self.config_path}[/bold]"
        run_msg = f"Run extraction now on all XML files in [bold]{self.input_dir}[/bold]?"
        with Vertical(id="dialog"):
            yield Label(saved_msg)
            yield Label(run_msg)
            with Horizontal(id="btn-row"):
                yield Button("Run", variant="success", id="btn-run")
                yield Button("Exit", id="btn-exit")

    @on(Button.Pressed, "#btn-run")
    def do_run(self) -> None:
        """Dismiss with ``True``: the user wants to run the extraction."""
        self.dismiss(True)

    @on(Button.Pressed, "#btn-exit")
    def do_exit(self) -> None:
        """Dismiss with ``False``: the user only wants to exit."""
        self.dismiss(False)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# ── Main App ───────────────────────────────────────────────────────────────────
|
|
355
|
+
|
|
356
|
+
# Workflow phase in which selecting a tree node sets the record root element.
PHASE_ROOT = "root"
# Workflow phase in which selecting a tree node maps it to an output column.
PHASE_FIELDS = "fields"
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class ConfigBuilderApp(App):
    """Interactive XML → YAML config builder.

    The app works in two phases. In ``PHASE_ROOT`` the user picks the element
    that represents one record; in ``PHASE_FIELDS`` the tree is narrowed to
    that element and each selected node is mapped to a named output column.
    The resulting config can be saved as YAML, after which the app exits with
    the config path (if the user chose to run the extraction) or ``None``.
    """

    TITLE = "xml-config-builder"
    SUB_TITLE = "Build xml-extractor configs interactively"

    DEFAULT_CSS = """
    #main {
        layout: horizontal;
        height: 1fr;
        overflow: hidden hidden;
    }
    #tree-pane {
        width: 1fr;
        border: solid $primary;
        padding: 0 1;
        overflow: hidden hidden;
    }
    #fields-pane {
        width: 1fr;
        border: solid $accent;
        padding: 0 1;
        overflow: hidden hidden;
    }
    .pane-title {
        text-style: bold;
        margin-bottom: 1;
    }
    #root-display {
        color: $success;
        margin-bottom: 1;
        overflow-x: auto;
    }
    #status {
        height: 3;
        background: $panel;
        padding: 0 2;
        content-align: left middle;
        color: $text-muted;
    }
    #fields-table {
        height: 1fr;
    }
    """

    BINDINGS = [
        Binding("s", "save", "Save config"),
        Binding("d", "delete_field", "Delete field"),
        Binding("r", "reset", "Reset"),
        Binding("q", "quit", "Quit"),
    ]

    def __init__(self, xml_file: Path):
        """Initialise empty builder state for the sample file ``xml_file``."""
        super().__init__()
        self.xml_file = xml_file
        self.ns: Dict[str, str] = {}  # prefix → URI, filled by _load_xml
        self.phase = PHASE_ROOT
        self.root_xpath: Optional[str] = None
        self.root_element: Optional[etree._Element] = None
        self.fields: List[Dict[str, str]] = []  # one dict per mapped column
        self.run_config: Optional[Path] = None  # set when the user chooses "Run"

    def compose(self) -> ComposeResult:
        """Lay out the tree pane, the mapped-fields pane and the status bar."""
        yield Header()
        with Horizontal(id="main"):
            with Vertical(id="tree-pane"):
                yield Static("XML Tree", classes="pane-title")
                yield XMLTree("document", id="xml-tree")
            with Vertical(id="fields-pane"):
                yield Static("Mapped Fields", classes="pane-title")
                yield Static("root_xpath: (not selected)", id="root-display")
                yield DataTable(id="fields-table", cursor_type="row")
        yield Static(id="status")
        yield Footer()

    def on_mount(self) -> None:
        """Prepare the table, load the document and show the welcome dialog."""
        self._setup_table()
        self._load_xml()
        self._update_status()
        self.push_screen(WelcomeDialog())

    @staticmethod
    def _escape_markup(text: str) -> str:
        """Escape ``text`` so it is shown literally inside a markup label.

        Anything that looks like a markup tag (``[bold]``, ``[/dim]``, ...) is
        backslash-escaped, and a lone trailing backslash is doubled so it
        cannot escape a tag appended after the text. The rules follow the
        escaping convention described in the Rich markup documentation.
        """
        import re  # local import keeps this block self-contained

        escaped = re.sub(
            r"(\\*)(\[[a-z#/@][^\[]*?\])",
            lambda m: m.group(1) * 2 + "\\" + m.group(2),
            text,
        )
        if escaped.endswith("\\") and not escaped.endswith("\\\\"):
            escaped += "\\"
        return escaped

    def _load_xml(self) -> None:
        """Parse the sample file, collect its namespaces and fill the tree."""
        parser = etree.XMLParser(remove_blank_text=True, recover=True)
        tree = etree.parse(str(self.xml_file), parser)
        root = tree.getroot()
        if root is None:
            # A recovering parser may yield no root for unusable input; report
            # it instead of failing on attribute access further down.
            self.notify(f"No XML root element found in {self.xml_file}", severity="error")
            return
        self.ns = collect_namespaces(root)
        self._populate_tree(root)

    def _populate_tree(self, root: etree._Element, subtree_root: Optional[etree._Element] = None) -> None:
        """Rebuild the tree widget from ``subtree_root`` if given, else ``root``.

        When ``subtree_root`` is given the tree's root label is tagged as the
        record root; ``root`` is not consulted in that case.
        """
        target = subtree_root if subtree_root is not None else root
        widget = self.query_one("#xml-tree", XMLTree)
        widget.clear()
        label = display_tag(target, self.ns)
        if subtree_root is not None:
            label += " [dim](record root)[/dim]"
        widget.root.set_label(label)
        widget.root.data = ElemData(target)
        self._add_children(widget.root, target, depth=0)
        widget.root.expand()

    def _add_children(self, node: TreeNode, el: etree._Element, depth: int) -> None:
        """Recursively add ``el``'s attributes and child elements under ``node``.

        Recursion stops once ``depth`` reaches 12. Attribute values and
        element text are truncated for the preview and markup-escaped so
        bracketed document content is displayed literally.
        """
        if depth >= 12:
            return
        for attr_name, attr_val in el.attrib.items():
            # Truncate first, then escape, so an escape is never cut in half.
            preview = self._escape_markup(attr_val[:50])
            child = node.add_leaf(f"[dim]@{attr_name}[/dim] = {preview}")
            child.data = AttrData(el, attr_name, attr_val)
        for child_el in el:
            # Skip nodes whose tag is not a string (not regular elements).
            if not isinstance(child_el.tag, str):
                continue
            tag = display_tag(child_el, self.ns)
            text = (child_el.text or "").strip()
            preview = f" [dim]{self._escape_markup(text[:40])}[/dim]" if text else ""
            # Attributes become tree leaves too, so they count as children.
            has_children = bool(len(child_el) or child_el.attrib)
            if has_children:
                child_node = node.add(f"{tag}{preview}")
            else:
                child_node = node.add_leaf(f"{tag}{preview}")
            child_node.data = ElemData(child_el)
            self._add_children(child_node, child_el, depth + 1)

    def _setup_table(self) -> None:
        """Create the two columns of the mapped-fields table."""
        table = self.query_one("#fields-table", DataTable)
        table.add_column("Column", width=20)
        table.add_column("XPath")

    def _refresh_table(self) -> None:
        """Re-render the mapped-fields table from ``self.fields``."""
        table = self.query_one("#fields-table", DataTable)
        table.clear()
        for f in self.fields:
            table.add_row(f["column"], f["xpath"])

    def _update_status(self) -> None:
        """Show the instructions for the current phase in the status bar."""
        if self.phase == PHASE_ROOT:
            msg = "Phase 1 — Navigate to the RECORD element (one row = one record), press Enter to set as root_xpath"
        else:
            msg = "Phase 2 — Navigate to a field, press Enter to map it | [S] save | [D] delete selected | [R] reset"
        self.query_one("#status", Static).update(msg)

    def _update_root_display(self) -> None:
        """Show the currently selected ``root_xpath`` above the fields table."""
        self.query_one("#root-display", Static).update(
            f"root_xpath: [bold]{self.root_xpath}[/bold]"
        )

    @on(Tree.NodeSelected)
    def on_node_selected(self, event: Tree.NodeSelected) -> None:
        """React to a tree selection according to the current phase.

        In ``PHASE_ROOT`` an element becomes the record root and the tree is
        narrowed to it; attributes are rejected with a warning. In
        ``PHASE_FIELDS`` a ``ColumnDialog`` is opened for the selected node
        and, if confirmed, the field is appended to ``self.fields``.
        """
        data = event.node.data
        if data is None:
            return

        if self.phase == PHASE_ROOT:
            if data.is_attr:
                self.notify("Select an element, not an attribute, as root", severity="warning")
                return
            self.root_element = data.element
            self.root_xpath = make_root_xpath(data.element, self.ns)
            self._update_root_display()
            self.phase = PHASE_FIELDS
            self._update_status()
            self._populate_tree(None, subtree_root=data.element)

        elif self.phase == PHASE_FIELDS:
            if data.is_attr:
                xpath = make_attr_xpath(data.element, self.ns, data.attr)
                sug = suggested_column(data.element, self.ns, data.attr)
            else:
                xpath = make_element_xpath(data.element, self.ns)
                sug = suggested_column(data.element, self.ns)

            def add_field(result: Optional[dict]) -> None:
                if result:
                    field = {"column": result["column"], "xpath": xpath}
                    # Only a separator other than " | " is stored in the config.
                    if result["separator"] != " | ":
                        field["separator"] = result["separator"]
                    self.fields.append(field)
                    self._refresh_table()

            self.push_screen(ColumnDialog(xpath, sug), add_field)

    def action_delete_field(self) -> None:
        """Remove the field under the table cursor (only in ``PHASE_FIELDS``)."""
        if self.phase != PHASE_FIELDS or not self.fields:
            return
        table = self.query_one("#fields-table", DataTable)
        idx = table.cursor_row
        if 0 <= idx < len(self.fields):
            self.fields.pop(idx)
            self._refresh_table()

    def action_reset(self) -> None:
        """Discard the root and all fields and return to ``PHASE_ROOT``."""
        self.phase = PHASE_ROOT
        self.root_xpath = None
        self.root_element = None
        self.fields = []
        self._refresh_table()
        self.query_one("#root-display", Static).update("root_xpath: (not selected)")
        self._load_xml()
        self._update_status()

    def action_save(self) -> None:
        """Ask for a target path and write the config there.

        Refuses with a notification while no root is selected or no field
        has been mapped.
        """
        if not self.root_xpath:
            self.notify("Select a root element first (Phase 1)", severity="error")
            return
        if not self.fields:
            self.notify("Map at least one field first", severity="warning")
            return

        default = self.xml_file.stem + "_config.yaml"

        def on_path(path: Optional[Path]) -> None:
            if not path:
                return
            self._write_yaml(path)

        self.push_screen(SaveDialog(default), on_path)

    def _write_yaml(self, path: Path) -> None:
        """Write the config to ``path``, then ask whether to run the extraction.

        A failed write is reported via a notification and the app keeps
        running. After a successful write a ``RunDialog`` is shown and the app
        exits with ``path`` if the user chose to run, otherwise with ``None``.
        """
        config = {
            "input_directory": str(self.xml_file.parent),
            "output_file": path.stem.removesuffix("_config"),
            "root_xpath": self.root_xpath,
            "namespaces": self.ns,
            "fields": self.fields,
        }
        try:
            with open(path, "w", encoding="utf-8") as f:
                yaml.dump(config, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        except Exception as e:
            self.notify(f"Save failed: {e}", severity="error")
            return

        def on_run(should_run: Optional[bool]) -> None:
            self.run_config = path if should_run else None
            self.exit(self.run_config)

        self.push_screen(RunDialog(path, self.xml_file.parent), on_run)
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
# ── CLI ────────────────────────────────────────────────────────────────────────
|
|
598
|
+
|
|
599
|
+
@click.command()
@click.argument("xml_file", type=click.Path(exists=True, path_type=Path))
def main(xml_file: Path) -> None:
    """Interactively build an xml-extractor config from an XML sample file."""
    # The app returns the saved config path only when the user chose "Run".
    config_path = ConfigBuilderApp(xml_file).run()
    if not config_path:
        return

    click.echo(f"\nRunning extraction with {config_path} …\n")
    try:
        # Imported lazily: only needed when an extraction is actually started.
        from xml_extractor import cli

        cli(args=["run", str(config_path)], standalone_mode=False)
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|