answer42 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_1c/rag/parsers.py ADDED
@@ -0,0 +1,387 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import xml.etree.ElementTree as ET
5
+ from pathlib import Path
6
+ from typing import Iterable
7
+
8
+ from .detect import detect_source_format
9
+ from .model import (
10
+ Attribute,
11
+ DataCompositionField,
12
+ Form,
13
+ FormElement,
14
+ MetadataObject,
15
+ ParsedSource,
16
+ SourceInfo,
17
+ TablePart,
18
+ TablePartColumn,
19
+ )
20
+
21
# Top-level source folder name -> Russian metadata kind used as the prefix of
# an object's full name (e.g. "Справочник.<name>"). Insertion order is the
# order in which the parsers walk the folders.
KIND_DIRS = dict(
    (
        ("Catalogs", "Справочник"),
        ("Documents", "Документ"),
        ("DataProcessors", "Обработка"),
        ("Reports", "Отчет"),
        ("Enums", "Перечисление"),
        ("CommonModules", "ОбщийМодуль"),
        ("InformationRegisters", "РегистрСведений"),
        ("AccumulationRegisters", "РегистрНакопления"),
        ("AccountingRegisters", "РегистрБухгалтерии"),
        ("CalculationRegisters", "РегистрРасчета"),
    )
)
33
+
34
+
35
def _strip_ns(tag: str) -> str:
    """Return the part of *tag* after the last ``}``.

    ElementTree spells namespaced tags as ``{uri}local``; a tag without a
    ``}`` is returned unchanged.
    """
    _, _, local = tag.rpartition("}")
    return local
37
+
38
+
39
def _text(elem: ET.Element | None) -> str | None:
    """Return the stripped text of *elem*, or ``None`` when there is nothing useful.

    ``None`` is returned for a missing element, an element without text and
    an element whose text is only whitespace.
    """
    raw = None if elem is None else elem.text
    if raw is None:
        return None
    stripped = raw.strip()
    return stripped if stripped else None
44
+
45
+
46
def _local_string_text(elem: ET.Element | None) -> str | None:
    """Extract a display string from an element holding plain or localized text.

    Lookup order:

    1. the element's own non-blank text;
    2. ``item`` descendants carrying a ``lang``/``key`` child and a
       ``content``/``value`` child - the "ru" entry wins, otherwise the last
       item with non-empty content is used;
    3. the last non-blank ``content``/``value`` descendant anywhere below.

    Returns ``None`` when *elem* is ``None`` or nothing is found.
    """
    if elem is None:
        return None

    own_text = _text(elem)
    if own_text:
        return own_text

    russian: str | None = None
    fallback: str | None = None
    for node in elem.iter():
        if _strip_ns(node.tag).lower() != "item":
            continue
        language: str | None = None
        body: str | None = None
        for part in node:
            part_tag = _strip_ns(part.tag).lower()
            if part_tag in {"lang", "key"}:
                language = _text(part)
            elif part_tag in {"content", "value"}:
                body = _text(part)
        if not body:
            continue
        fallback = body
        if language == "ru":
            russian = body

    if russian or fallback:
        return russian or fallback

    # No usable <item> wrappers: accept any content/value node, last one wins.
    for node in elem.iter():
        if _strip_ns(node.tag).lower() in {"content", "value"}:
            found = _text(node)
            if found:
                fallback = found
    return fallback
75
+
76
+
77
def _first_text(root: ET.Element, names: Iterable[str]) -> str | None:
    """Return the text of the first element (root included, document order) named in *names*.

    Only the first matching element is consulted: if it has no usable text
    the result is ``None`` even when a later match would have some.
    Tag names are compared case-sensitively after namespace removal.
    """
    wanted = frozenset(names)
    first_match = next(
        (node for node in root.iter() if _strip_ns(node.tag) in wanted),
        None,
    )
    return _text(first_match)
83
+
84
+
85
def _direct_text(root: ET.Element, names: Iterable[str]) -> str | None:
    """Return the text of the first *direct* child of *root* named in *names*.

    Unlike ``_first_text`` this never descends below the immediate
    children. The first matching child decides the result, even if its
    text is empty (``None`` is returned in that case).
    """
    wanted = frozenset(names)
    for child in root:
        if _strip_ns(child.tag) not in wanted:
            continue
        return _text(child)
    return None
91
+
92
+
93
def _synonym_text(root: ET.Element) -> str | None:
    """Return the synonym (display name) stored in the first ``synonym`` element.

    The first element in document order (root included) whose tag is
    ``synonym`` (case-insensitive, namespace ignored) is the only one
    inspected. Supported layouts, tried in order:

    1. plain text directly inside the element;
    2. sibling ``<key>``/``<value>`` children - the value that directly
       follows a ``<key>ru</key>`` wins, otherwise the last non-empty value;
    3. ``<item><lang/><content/></item>`` children, delegated to
       ``_local_string_text``. Previously this layout always produced
       ``None``.

    Returns ``None`` when there is no synonym element or it holds no text.
    """
    for elem in root.iter():
        if _strip_ns(elem.tag).lower() != "synonym":
            continue

        direct = _text(elem)
        if direct:
            return direct

        ru_value: str | None = None
        any_value: str | None = None
        children = list(elem)
        for idx, child in enumerate(children):
            if _strip_ns(child.tag).lower() != "value":
                continue
            any_value = _text(child) or any_value
            if idx > 0:
                previous = children[idx - 1]
                if _strip_ns(previous.tag).lower() == "key" and _text(previous) == "ru":
                    ru_value = _text(child)

        # Key/value pairs found nothing: try the item/lang/content layout
        # before giving up, so both dump formats yield a synonym.
        return ru_value or any_value or _local_string_text(elem)
    return None
110
+
111
+
112
def _children_named(root: ET.Element, name: str) -> list[ET.Element]:
    """Collect every element in the tree under *root* whose local tag equals *name*.

    Despite the function name the search is recursive and includes *root*
    itself; the comparison is case-sensitive and ignores namespaces.
    """
    matches: list[ET.Element] = []
    for node in root.iter():
        if _strip_ns(node.tag) == name:
            matches.append(node)
    return matches
114
+
115
+
116
def _metadata_child_name(elem: ET.Element) -> str | None:
    """Return the name of a metadata child from an EDT or Configurator XML dump.

    A direct ``<name>``/``<Name>`` child is preferred (the EDT layout).
    Otherwise each direct ``Properties`` child (tag matched
    case-insensitively) is checked for its own direct name child (the
    Configurator layout). The lookup deliberately never goes deeper, so the
    name of a nested column is not mistaken for the name of the tabular
    section that contains it.
    """
    name_tags = ("name", "Name")

    own_name = _direct_text(elem, name_tags)
    if own_name:
        return own_name

    for section in elem:
        if _strip_ns(section.tag).lower() != "properties":
            continue
        nested_name = _direct_text(section, name_tags)
        if nested_name:
            return nested_name
    return None
132
+
133
+
134
def _xml_root(path: Path) -> ET.Element | None:
    """Parse the XML file at *path* and return its root element.

    Malformed XML yields ``None`` instead of an exception. Other errors
    (for example a missing or unreadable file) are not caught here.
    """
    try:
        tree = ET.parse(path)
    except ET.ParseError:
        return None
    return tree.getroot()
139
+
140
+
141
def _type_text(elem: ET.Element) -> str | None:
    """Summarise the type information found under *elem* as one string.

    The stripped text of every element (including *elem*) whose local tag
    is one of ``type``/``Type``/``types``/``Types``/``class``/``Class`` is
    collected, duplicates are dropped keeping first-seen order, and the
    result is joined with ``", "``. Returns ``None`` when nothing is found.
    """
    type_tags = {"type", "Type", "types", "Types", "class", "Class"}
    seen: dict[str, None] = {}  # dict used as an insertion-ordered set
    for node in elem.iter():
        if _strip_ns(node.tag) not in type_tags:
            continue
        cleaned = (node.text or "").strip()
        if cleaned:
            seen.setdefault(cleaned, None)
    joined = ", ".join(seen)
    return joined if joined else None
148
+
149
+
150
def _metadata_field_role(tag: str) -> str | None:
    """Map an XML tag name to the role of the metadata field it describes.

    The match is case-insensitive and accepts singular and plural spellings.
    Returns ``"attribute"``, ``"dimension"``, ``"resource"`` or
    ``"standard_attribute"``; any other tag gives ``None``.
    """
    roles_by_tag = {
        "attribute": "attribute",
        "attributes": "attribute",
        "requisite": "attribute",
        "requisites": "attribute",
        "dimension": "dimension",
        "dimensions": "dimension",
        "resource": "resource",
        "resources": "resource",
        "standardattribute": "standard_attribute",
        "standardattributes": "standard_attribute",
    }
    return roles_by_tag.get(tag.lower())
161
+
162
+
163
def parse_source(source: SourceInfo) -> ParsedSource:
    """Parse *source* with the parser matching its format.

    When ``source.format`` is ``"unknown"`` the format is detected from
    ``source.path``. The chosen parser receives a fresh ``SourceInfo`` that
    carries the resolved format and copies every other field.

    Raises:
        ValueError: if the resolved format is neither ``"edt"`` nor
            ``"designer_xml"``.
    """
    declared = source.format
    fmt = detect_source_format(source.path) if declared == "unknown" else declared
    source = SourceInfo(
        name=source.name,
        path=source.path,
        kind=source.kind,
        format=fmt,
        repo_url=source.repo_url,
        branch=source.branch,
        commit=source.commit,
    )
    if fmt == "edt":
        parser = parse_edt_source
    elif fmt == "designer_xml":
        parser = parse_designer_xml_source
    else:
        raise ValueError(f"Unsupported or unknown 1C source format for {source.path}: {fmt}")
    return parser(source)
179
+
180
+
181
def parse_edt_source(source: SourceInfo) -> ParsedSource:
    """Parse an EDT project tree into a ``ParsedSource``.

    For each folder in ``KIND_DIRS`` every ``<folder>/*/*.mdo`` file is one
    metadata object. The object is recorded even when its ``.mdo`` cannot
    be parsed; in that case its name falls back to the directory name and
    no attributes or table parts are extracted. Forms and data composition
    schemas are looked up in the object's directory in either case.
    """
    parsed = ParsedSource(source=source)
    for folder, ru_kind in KIND_DIRS.items():
        descriptors = sorted((source.path / folder).glob("*/*.mdo"))
        for mdo in descriptors:
            object_dir = mdo.parent
            obj_name = object_dir.name
            synonym = None
            hierarchical = False
            owners: list[str] = []

            root = _xml_root(mdo)
            if root is not None:
                obj_name = _first_text(root, ["name", "Name"]) or obj_name
                synonym = _synonym_text(root)
                flag = _first_text(root, ["hierarchical", "Hierarchical"]) or ""
                hierarchical = flag.lower() == "true"
                owner_nodes = _children_named(root, "owner") + _children_named(root, "Owner")
                owners = [value for value in map(_text, owner_nodes) if value]

            full = f"{ru_kind}.{obj_name}"
            parsed.objects.append(
                MetadataObject(full, ru_kind, obj_name, synonym, str(mdo), source.format, hierarchical, owners)
            )
            if root is not None:
                _extract_metadata_children(parsed, root, full, mdo)
            _extract_forms(parsed, object_dir, full)
            _extract_data_composition_schemas(parsed, object_dir, full)
    return parsed
203
+
204
+
205
def parse_designer_xml_source(source: SourceInfo) -> ParsedSource:
    """Parse a Configurator ("designer") XML dump into a ``ParsedSource``.

    For each folder in ``KIND_DIRS`` that exists, ``<folder>/*.xml`` and
    ``<folder>/*/*.xml`` are treated as object descriptors. Files named
    ``Form.xml`` and files below a ``Forms`` directory are skipped, because
    forms are handled by ``_extract_forms``.

    The object is recorded even when its XML cannot be parsed; its name
    then falls back to the file stem (descriptor directly in the folder) or
    to the parent directory name. Forms and data composition schemas are
    searched in the object's directory: ``<folder>/<stem>`` for a
    descriptor directly in the folder, else the descriptor's own directory.
    """
    parsed = ParsedSource(source=source)
    for folder, ru_kind in KIND_DIRS.items():
        base = source.path / folder
        if not base.exists():
            continue
        candidates = list(base.glob("*.xml")) + list(base.glob("*/*.xml"))
        for xml in sorted(candidates):
            # as_posix() always uses "/" separators. str(xml) uses "\\" on
            # Windows, where the "/Forms/" check could never match.
            if xml.name == "Form.xml" or "/Forms/" in xml.as_posix():
                continue
            obj_name = xml.stem if xml.parent == base else xml.parent.name
            root = _xml_root(xml)
            if root is not None:
                obj_name = _first_text(root, ["name", "Name"]) or obj_name
                synonym = _synonym_text(root)
                hierarchical = (_first_text(root, ["hierarchical", "Hierarchical"]) or "").lower() == "true"
                owners = [(_text(x) or "") for x in _children_named(root, "owner") + _children_named(root, "Owner")]
            else:
                synonym = None
                hierarchical = False
                owners = []
            full = f"{ru_kind}.{obj_name}"
            parsed.objects.append(
                MetadataObject(full, ru_kind, obj_name, synonym, str(xml), source.format, hierarchical, [o for o in owners if o])
            )
            if root is not None:
                _extract_metadata_children(parsed, root, full, xml)
            object_dir = xml.parent if xml.parent != base else base / xml.stem
            _extract_forms(parsed, object_dir, full)
            _extract_data_composition_schemas(parsed, object_dir, full)
    return parsed
234
+
235
+
236
def _extract_metadata_children(parsed: ParsedSource, root: ET.Element, full: str, source_path: Path) -> None:
    """Append the attributes, table parts and table-part columns found under *root*.

    Heuristic that covers both EDT and designer XML structures. Fields
    nested inside a tabular section are reported as columns of that table
    part, never as attributes of the object itself. Elements without a
    resolvable name are ignored, and a column whose name equals its table
    part's name is skipped.
    """
    table_tags = {"tabularsection", "tabularsections", "tablepart", "tableparts"}
    origin = str(source_path)

    # Pass 1: remember every node that lives inside (or is) a table part so
    # that pass 2 can tell object attributes from column attributes.
    inside_table_part: set[int] = set()
    for node in root.iter():
        if _strip_ns(node.tag).lower() in table_tags:
            inside_table_part.update(id(inner) for inner in node.iter())

    # Pass 2: emit records.
    for node in root.iter():
        tag = _strip_ns(node.tag).lower()
        role = _metadata_field_role(tag)

        if role is not None:
            if id(node) in inside_table_part:
                continue
            field_name = _metadata_child_name(node)
            if field_name:
                parsed.attributes.append(
                    Attribute(full, field_name, _type_text(node), _synonym_text(node), None, origin, role)
                )
            continue

        if tag not in table_tags:
            continue
        part_name = _metadata_child_name(node)
        if not part_name:
            continue
        parsed.table_parts.append(TablePart(full, part_name, _synonym_text(node), origin))
        for col in node.iter():
            if _metadata_field_role(_strip_ns(col.tag).lower()) is None:
                continue
            col_name = _metadata_child_name(col)
            if col_name and col_name != part_name:
                parsed.table_part_columns.append(
                    TablePartColumn(full, part_name, col_name, _type_text(col), _synonym_text(col), None, origin)
                )
264
+
265
+
266
def _extract_forms(parsed: ParsedSource, object_dir: Path, full: str) -> None:
    """Register every form found in ``<object_dir>/Forms``.

    Each sub-directory is one form, named after the directory. The form
    description is read from ``Ext/Form.xml`` or, failing that,
    ``Form.xml``. A form without either file is still registered, with its
    directory as the source path. When the file exists and parses, its
    elements and embedded data composition fields are extracted as well.
    """
    forms_dir = object_dir / "Forms"
    if not forms_dir.exists():
        return

    form_dirs = sorted(entry for entry in forms_dir.iterdir() if entry.is_dir())
    for form_dir in form_dirs:
        form_name = form_dir.name
        form_xml = form_dir / "Ext" / "Form.xml"
        if not form_xml.exists():
            form_xml = form_dir / "Form.xml"
        has_xml = form_xml.exists()

        origin = str(form_xml) if has_xml else str(form_dir)
        parsed.forms.append(Form(full, form_name, None, origin))
        if not has_xml:
            continue

        root = _xml_root(form_xml)
        if root is None:
            continue
        _extract_form_elements(parsed, root, full, form_name, form_xml)
        _extract_embedded_data_composition_fields(parsed, root, full, form_name, form_xml)
281
+
282
+
283
def _extract_form_elements(parsed: ParsedSource, root: ET.Element, full: str, form_name: str, source_path: Path) -> None:
    """Append a ``FormElement`` for every named UI node in a form's XML.

    A node qualifies when its local tag (case-insensitive) is one of
    element/item/items/button/command/field/table/group. Name, title, data
    path and command name are each taken from XML attributes first and,
    failing that, from the first matching element inside the node. Nodes
    without a name are skipped.
    """
    element_tags = {"element", "item", "items", "button", "command", "field", "table", "group"}
    origin = str(source_path)

    for node in root.iter():
        tag = _strip_ns(node.tag)
        if tag.lower() not in element_tags:
            continue

        attrs = node.attrib
        name = attrs.get("name") or attrs.get("Name") or _first_text(node, ["name", "Name"])
        if not name:
            continue

        title = (
            attrs.get("title")
            or attrs.get("Title")
            or _first_text(node, ["title", "Title", "caption", "Caption"])
        )
        data_path = (
            attrs.get("dataPath")
            or attrs.get("path")
            or _first_text(node, ["dataPath", "path", "DataPath"])
        )
        command_name = attrs.get("commandName") or _first_text(node, ["commandName", "CommandName"])

        parsed.form_elements.append(
            FormElement(full, form_name, name, tag, title, data_path, command_name, None, origin)
        )
305
+
306
+
307
def _extract_embedded_data_composition_fields(parsed: ParsedSource, root: ET.Element, full: str, form_name: str, source_path: Path) -> None:
    """Append data composition fields declared inside a form's XML.

    The tree is walked in document order. ``dataset``/``datasetquery``
    nodes update the current dataset name; ``fields``/``field``/
    ``calculatedfield``/``totalfield`` nodes (case-insensitive) become
    ``DataCompositionField`` records tagged ``{"embedded_in_form": True}``.
    A node with no data path, field and title is skipped.
    """
    field_tags = {"fields", "field", "calculatedfield", "totalfield"}
    origin = str(source_path)
    dataset_name: str | None = None

    for node in root.iter():
        tag = _strip_ns(node.tag)
        lowered = tag.lower()
        if lowered in {"dataset", "datasetquery"}:
            dataset_name = _metadata_child_name(node) or dataset_name
        if lowered not in field_tags:
            continue

        data_path = _direct_text(node, ["dataPath", "DataPath"]) or _direct_text(node, ["name", "Name"])
        field = _direct_text(node, ["field", "Field"])

        # First direct title/caption child, if any.
        title_node = None
        for child in node:
            if _strip_ns(child.tag).lower() in {"title", "caption"}:
                title_node = child
                break
        title = _local_string_text(title_node)
        expression = _direct_text(node, ["expression", "Expression"])

        if not (data_path or field or title):
            continue

        record = DataCompositionField(
            full,
            form_name,
            data_path or field or title or "",
            field,
            title,
            expression,
            dataset_name,
            tag,
            origin,
            {"embedded_in_form": True},
        )
        parsed.data_composition_fields.append(record)
337
+
338
+
339
def _extract_data_composition_schemas(parsed: ParsedSource, object_dir: Path, full: str) -> None:
    """Append the fields of every ``Templates/*/Template.dcs`` schema of an object.

    The schema is named after the directory holding ``Template.dcs``;
    files that fail to parse are skipped. Within a schema the tree is
    walked in document order: ``dataset`` nodes update the current dataset
    name, and ``field``/``calculatedfield``/``totalfield`` nodes
    (case-insensitive) become ``DataCompositionField`` records. A node with
    no data path, field and title is skipped.
    """
    templates_dir = object_dir / "Templates"
    if not templates_dir.exists():
        return

    field_tags = {"field", "calculatedfield", "totalfield"}
    for dcs in sorted(templates_dir.glob("*/Template.dcs")):
        root = _xml_root(dcs)
        if root is None:
            continue

        schema_name = dcs.parent.name
        origin = str(dcs)
        dataset_name: str | None = None

        for node in root.iter():
            tag = _strip_ns(node.tag)
            lowered = tag.lower()
            if lowered == "dataset":
                dataset_name = _metadata_child_name(node) or dataset_name
            if lowered not in field_tags:
                continue

            data_path = _direct_text(node, ["dataPath", "DataPath"]) or _direct_text(node, ["name", "Name"])
            field = _direct_text(node, ["field", "Field"])

            # First direct title/caption child, if any.
            title_node = None
            for child in node:
                if _strip_ns(child.tag).lower() in {"title", "caption"}:
                    title_node = child
                    break
            title = _local_string_text(title_node)
            expression = _direct_text(node, ["expression", "Expression"])

            if not (data_path or field or title):
                continue

            record = DataCompositionField(
                full,
                schema_name,
                data_path or field or title or "",
                field,
                title,
                expression,
                dataset_name,
                tag,
                origin,
            )
            parsed.data_composition_fields.append(record)
376
+
377
+
378
def navigation_for_object(full_name: str, kind: str) -> tuple[str, str] | None:
    """Build the ``(mode, link)`` navigation pair for a metadata object.

    Catalogs, documents, information registers and accumulation registers
    get a ``"list"`` link; data processors and reports get an ``"app"``
    link. Every other kind returns ``None``.
    """
    mode_by_kind = {
        "Справочник": "list",
        "Документ": "list",
        "РегистрСведений": "list",
        "РегистрНакопления": "list",
        "Обработка": "app",
        "Отчет": "app",
    }
    mode = mode_by_kind.get(kind)
    if mode == "list":
        return ("list", f"e1cib/list/{full_name}")
    if mode == "app":
        return ("app", f"e1cib/app/{full_name}")
    return None
384
+
385
+
386
def normalize_query(text: str) -> str:
    """Normalise a search query for matching.

    Surrounding whitespace is trimmed, the text is lower-cased, ``ё`` is
    folded to ``е`` and every run of whitespace becomes one space.
    """
    lowered = text.strip().lower()
    unified = lowered.replace("ё", "е")
    return re.sub(r"\s+", " ", unified)