xml2xlsx-lidilem 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xml2xlsx_lidilem-0.1.0/PKG-INFO +10 -0
- xml2xlsx_lidilem-0.1.0/pyproject.toml +19 -0
- xml2xlsx_lidilem-0.1.0/setup.cfg +4 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx/__init__.py +0 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx/format_excel.py +287 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx/xml2xlsx.py +386 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx_lidilem.egg-info/PKG-INFO +10 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx_lidilem.egg-info/SOURCES.txt +10 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx_lidilem.egg-info/dependency_links.txt +1 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx_lidilem.egg-info/entry_points.txt +2 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx_lidilem.egg-info/requires.txt +3 -0
- xml2xlsx_lidilem-0.1.0/xml2xlsx_lidilem.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xml2xlsx-lidilem
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert annotated XML to Excel
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: openpyxl
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: numpy
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=42", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "xml2xlsx-lidilem"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Convert annotated XML to Excel"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
dependencies = [
    "openpyxl",
    "pandas",
    "numpy",
    # Required at runtime: format_excel.py opens pd.ExcelWriter(engine='xlsxwriter').
    "xlsxwriter",
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
xml2xlsx = "xml2xlsx.xml2xlsx:main"
|
|
File without changes
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import math
|
|
4
|
+
from openpyxl.utils import get_column_letter
|
|
5
|
+
from openpyxl.styles import Alignment, PatternFill
|
|
6
|
+
from openpyxl.utils import get_column_letter
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
# Font color (hex RGB) used for the text enclosed by each annotation tag when a
# cell is written as a rich string (xlsxwriter ``font_color`` values).
TAG_COLORS = dict(
    INTRODD='#1F4E79',    # dark blue
    VDD='#7030A0',        # purple
    EXPANSION='#C55A11',  # dark orange
    MOD='#833C00',        # brown
    PPI='#C00000',        # dark red
    NONPPI='#375623',     # dark green
    MD='#2E75B6',         # medium blue
    APP='#595959',        # dark grey
    DD='#1D6B5E',         # dark teal
)
# Fallback color for tags that have no entry in TAG_COLORS (black).
TAG_COLOR_DEFAULT = '#000000'
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_tagged_text(text):
|
|
25
|
+
"""
|
|
26
|
+
Parse a string containing XML-like tags and return a list of segments:
|
|
27
|
+
[{'type': 'text'|'open_tag'|'close_tag', 'value': str, 'tag': str or None}]
|
|
28
|
+
"""
|
|
29
|
+
pattern = re.compile(r'(</?(\w+)>)')
|
|
30
|
+
segments = []
|
|
31
|
+
pos = 0
|
|
32
|
+
for m in pattern.finditer(text):
|
|
33
|
+
start, end = m.start(), m.end()
|
|
34
|
+
if pos < start:
|
|
35
|
+
segments.append({'type': 'text', 'value': text[pos:start], 'tag': None})
|
|
36
|
+
full_tag = m.group(1)
|
|
37
|
+
tag_name = m.group(2)
|
|
38
|
+
is_close = full_tag.startswith('</')
|
|
39
|
+
segments.append({
|
|
40
|
+
'type': 'close_tag' if is_close else 'open_tag',
|
|
41
|
+
'value': full_tag,
|
|
42
|
+
'tag': tag_name
|
|
43
|
+
})
|
|
44
|
+
pos = end
|
|
45
|
+
if pos < len(text):
|
|
46
|
+
segments.append({'type': 'text', 'value': text[pos:], 'tag': None})
|
|
47
|
+
return segments
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _build_rich_string_args(text, workbook):
    """
    Build the argument list for worksheet.write_rich_string() from tagged text.

    The result alternates format objects and strings:
      - tag markers themselves (``<PPI>``, ``</PPI>``, ...) are grey italic,
      - text inside a tag is bold and colored per TAG_COLORS, using the
        innermost tag that is still open,
      - text outside any tag is black.

    Args:
        text: cell text, possibly containing ``<TAG>`` / ``</TAG>`` markers.
        workbook: xlsxwriter workbook used to create the format objects.

    Returns:
        The list of alternating format/string arguments, or None when the
        caller should write the text as a plain cell instead: either the text
        contains no tags, or it would yield a single format/string pair, which
        xlsxwriter refuses to write as a rich string.
    """
    segments = _parse_tagged_text(text)
    if all(seg['type'] == 'text' for seg in segments):
        return None

    plain_fmt = workbook.add_format({'font_color': '#000000', 'text_wrap': True, 'valign': 'top'})
    tag_label_fmt = workbook.add_format({'font_color': '#AAAAAA', 'italic': True, 'text_wrap': True, 'valign': 'top'})
    content_formats = {}

    def content_format(tag_name):
        # One format per tag name, created lazily and reused within this call.
        if tag_name not in content_formats:
            content_formats[tag_name] = workbook.add_format({
                'font_color': TAG_COLORS.get(tag_name, TAG_COLOR_DEFAULT),
                'text_wrap': True,
                'valign': 'top',
                'bold': True,
            })
        return content_formats[tag_name]

    args = []
    open_tags = []  # stack of currently open tag names, innermost last
    for seg in segments:
        value = seg['value']
        if not value:
            # Excel does not allow empty fragments in a rich string.
            continue
        if seg['type'] == 'text':
            fmt = content_format(open_tags[-1]) if open_tags else plain_fmt
        else:
            fmt = tag_label_fmt
            if seg['type'] == 'open_tag':
                open_tags.append(seg['tag'])
            elif open_tags and open_tags[-1] == seg['tag']:
                # Only a close tag matching the innermost open tag ends it;
                # unbalanced close tags are displayed but otherwise ignored.
                open_tags.pop()
        args.extend((fmt, value))

    # write_rich_string() needs more than one format/string pair; with fewer
    # it warns and writes nothing, so let the caller fall back to a plain write.
    if len(args) <= 2:
        return None
    return args
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _visible_text(text):
    """Return *text* without XML-like tags and ``**`` markers (used for sizing only)."""
    return re.sub(r'\*\*', '', re.sub(r'<[^>]+>', '', text))


def _ppi_rich_tokens(text, bold):
    """
    Convert ``**x**`` / ``<strong>x</strong>`` markup to ``<PPI>x</PPI>`` and
    tokenize the result for write_rich_string().

    Returns ``(converted_text, tokens)``. The ``<PPI>`` / ``</PPI>`` markers
    are kept as literal unformatted fragments, and every non-empty fragment
    between them is preceded by the *bold* format.
    """
    text = re.sub(r'\*\*(.*?)\*\*', r'<PPI>\1</PPI>', text)
    text = re.sub(r'<strong>(.*?)</strong>', r'<PPI>\1</PPI>', text)
    tokens = []
    in_ppi = False
    for part in re.split(r'(<PPI>|</PPI>)', text):
        if not part:
            continue
        if part == '<PPI>':
            in_ppi = True
        elif part == '</PPI>':
            in_ppi = False
        elif in_ppi:
            tokens.append(bold)
        tokens.append(part)
    return text, tokens


def format_ppi_bold(df, filename):
    """
    Write *df* to *filename* as a formatted single-sheet workbook ('Sheet1').

    Processing steps:
      - in object-dtype columns, every en dash in a string cell is moved to a
        new line ('–' -> '\\n–', then doubled newlines are collapsed);
      - columns whose name starts with 'Unnamed' are dropped;
      - headers are written bold on a light grey background;
      - column widths follow the longest visible text (+2), capped at 100;
      - row heights are estimated at 15 points per wrapped line;
      - 'paragraph_text' and 'paragraph_text_dd' cells are written with
        per-tag colors, other cells with bold text between PPI markers.

    Args:
        df: DataFrame to export. It is not modified; a copy is processed.
        filename: path of the .xlsx file to create (xlsxwriter engine).
    """
    # Work on a copy so the dash rewriting below does not leak into the
    # caller's DataFrame.
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(
                lambda v: v.replace('–', '\n–').replace('\n\n', '\n') if isinstance(v, str) else v
            )

    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.reset_index(drop=True)

    columns = list(df.columns)
    # Materialize each row once instead of building a row Series per cell.
    cells = [df.iloc[i].tolist() for i in range(len(df))]
    # Cell text as it will be written (None for missing values) and the same
    # text stripped of markup, which is what sizing is based on.
    texts = [[str(v) if pd.notna(v) else None for v in row] for row in cells]
    visible = [[_visible_text(t) if t is not None else None for t in row] for row in texts]

    # Column widths: header length, widened by non-blank cells, capped at 100.
    widths = [len(str(name)) for name in columns]
    for text_row, visible_row in zip(texts, visible):
        for c, (text, shown) in enumerate(zip(text_row, visible_row)):
            if text is not None and text.strip():
                widths[c] = max(widths[c], len(shown) + 2)
    widths = [min(width, 100) for width in widths]

    # Row heights: tallest cell of the row, at base_height per estimated line.
    base_height = 15
    heights = []
    for visible_row in visible:
        height = base_height
        for c, shown in enumerate(visible_row):
            if shown is None:
                continue
            chars_per_line = int(widths[c] * 1.1)
            lines_needed = max(1, math.ceil(len(shown) / chars_per_line)) if chars_per_line > 0 else 1
            height = max(height, base_height * lines_needed)
        heights.append(height)

    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        workbook = writer.book
        worksheet = workbook.add_worksheet('Sheet1')
        writer.sheets['Sheet1'] = worksheet

        bold = workbook.add_format({'bold': True})
        regular = workbook.add_format({'text_wrap': True, 'valign': 'top'})
        header_format = workbook.add_format({'bold': True, 'valign': 'top', 'bg_color': '#F2F2F2'})

        for c, name in enumerate(columns):
            worksheet.write(0, c, name, header_format)
            worksheet.set_column(c, c, widths[c], regular)

        for r, text_row in enumerate(texts):
            excel_row = r + 1  # row 0 holds the headers
            worksheet.set_row(excel_row, heights[r])

            for c, text in enumerate(text_row):
                if text is None:
                    continue

                if columns[c] in ['paragraph_text', 'paragraph_text_dd']:
                    # Rich string with tag color coding
                    tokens = _build_rich_string_args(text, workbook)
                    plain = text
                else:
                    # PPI bold logic for all other columns
                    plain, tokens = _ppi_rich_tokens(text, bold)

                # xlsxwriter ignores rich strings made of two or fewer tokens
                # (it warns and writes nothing), so those are written plain.
                if tokens and len(tokens) > 2:
                    worksheet.write_rich_string(excel_row, c, *tokens)
                else:
                    worksheet.write(excel_row, c, plain, regular)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def color_compare_pairs(df, filename):
    """
    Write *df* to *filename* and color matching '<name>_human' / '<name>_ia' cells.

    For every column ending in '_human' that has a sibling column with the
    same stem and the suffix '_ia', the two cells of each row are filled green
    when their values are equal (ignoring case and surrounding whitespace,
    missing values counting as empty) and red otherwise. Column widths are set
    to the longest cell text + 2, capped at 50.

    If no such pair exists, a message is printed and no file is written.

    Args:
        df: DataFrame to export (not modified).
        filename: path of the .xlsx file to create (openpyxl engine, mode 'w').
    """
    df_copy = df.copy()
    human_suffix = '_human'

    pairs = []
    for human_col in df_copy.columns:
        if not (isinstance(human_col, str) and human_col.endswith(human_suffix)):
            continue
        # Swap only the trailing suffix; str.replace would also rewrite any
        # earlier '_human' inside the column name.
        ia_col = human_col[:-len(human_suffix)] + '_ia'
        if ia_col in df_copy.columns:
            pairs.append((human_col, ia_col))

    if not pairs:
        print("No '_human' and '_ia' column pairs found")
        return

    # Resolve the 1-based worksheet column of every paired column once.
    pair_columns = [
        (human_col, ia_col,
         df_copy.columns.get_loc(human_col) + 1,
         df_copy.columns.get_loc(ia_col) + 1)
        for human_col, ia_col in pairs
    ]

    with pd.ExcelWriter(filename, engine='openpyxl', mode='w') as writer:
        df_copy.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        green_fill = PatternFill(start_color='C6EFCE', end_color='C6EFCE', fill_type='solid')
        red_fill = PatternFill(start_color='FFC7CE', end_color='FFC7CE', fill_type='solid')

        for row_num in range(len(df_copy)):
            excel_row = row_num + 2  # 1-based rows, and row 1 holds the headers
            row = df_copy.iloc[row_num]
            for human_col, ia_col, human_idx, ia_idx in pair_columns:
                human_val = row[human_col]
                ia_val = row[ia_col]
                human_str = str(human_val) if pd.notna(human_val) else ""
                ia_str = str(ia_val) if pd.notna(ia_val) else ""
                are_equal = human_str.strip().lower() == ia_str.strip().lower()
                fill = green_fill if are_equal else red_fill
                worksheet.cell(row=excel_row, column=human_idx).fill = fill
                worksheet.cell(row=excel_row, column=ia_idx).fill = fill

        for column in worksheet.columns:
            # Empty cells are skipped rather than measured as the text 'None'.
            max_length = max(
                (len(str(cell.value)) for cell in column if cell.value is not None),
                default=0,
            )
            column_letter = get_column_letter(column[0].column)
            worksheet.column_dimensions[column_letter].width = min(max_length + 2, 50)
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def format_and_compare(df, filename):
    """
    Run format_ppi_bold() and then color_compare_pairs() on the same file.

    The workbook written by format_ppi_bold() is read back with pandas and the
    resulting DataFrame is handed to color_compare_pairs() with the same path.

    NOTE(review): when '_human'/'_ia' pairs exist, color_compare_pairs()
    rewrites *filename* from the re-read values, so the rich-text formatting
    of the first step is presumably lost — confirm this is intended.
    """
    format_ppi_bold(df, filename)
    reloaded = pd.read_excel(filename)
    color_compare_pairs(reloaded, filename)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
if __name__ == "__main__":
    # Script mode: format <input.xlsx> into <input>_formatted.xlsx.
    if len(sys.argv) < 2:
        print("Usage: format_excel.py <input.xlsx>")
        sys.exit(1)

    source_path = sys.argv[1]
    # Rewrite only a trailing '.xlsx' so the output can never be the input
    # file itself and no '.xlsx' elsewhere in the path is touched.
    if source_path.lower().endswith(".xlsx"):
        target_path = source_path[:-len(".xlsx")] + "_formatted.xlsx"
    else:
        target_path = source_path + "_formatted.xlsx"

    df = pd.read_excel(source_path)
    format_ppi_bold(df, target_path)
|
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import xml.etree.ElementTree as ET
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from xml2xlsx.format_excel import format_ppi_bold
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def clean_text(text):
    """
    Collapse every run of whitespace in *text* to a single space and trim it.

    Returns None when *text* is falsy or contains only whitespace.
    """
    if text:
        collapsed = ' '.join(text.split())
        if collapsed:
            return collapsed
    return None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def clean_lower(text):
    """
    Collapse whitespace runs in *text* to single spaces and lowercase it.

    Returns None when *text* is falsy; a whitespace-only string yields ''.
    """
    if not text:
        return None
    words = text.split()
    return ' '.join(words).lower()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# '*_text' columns that keep their original casing; main() lowercases every
# other column whose name ends in '_text'.
NO_LOWER = {
    'INTRODD_text',
    'paragraph_text',
    'APP_text',
    'INTRODD_EXPANSION_text',
    'INTRODD_EXPANSION_2_text',
    'INTRODD_EXPANSION_3_text',
}

# Tags to preserve in paragraph_text (order matters for nested: deepest first)
PRESERVE_TAGS = 'INTRODD VDD EXPANSION MOD PPI NONPPI MD APP'.split()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def serialize_paragraph(p_elem):
    """
    Rebuild a paragraph's text, keeping nested tags as literal ``<TAG>`` text.

    Every element except those named 'p' is wrapped in ``<TAG>`` / ``</TAG>``
    markers (attributes are not reproduced). Whitespace runs in the result are
    collapsed to single spaces.

    Text that follows the closing tag of *p_elem* itself (its ``tail``) is not
    part of the paragraph and is left out; tails of nested elements are kept.

    Args:
        p_elem: xml.etree.ElementTree element of the paragraph.

    Returns:
        The serialized text, or None when the paragraph has no visible content.
    """
    pieces = []

    def _collect(elem, is_root):
        wrap = elem.tag != 'p'
        if wrap:
            pieces.append(f'<{elem.tag}>')
        if elem.text:
            pieces.append(elem.text)
        for child in elem:
            _collect(child, False)
        if wrap:
            pieces.append(f'</{elem.tag}>')
        # The root's tail belongs to the parent document, not to this paragraph.
        if elem.tail and not is_root:
            pieces.append(elem.tail)

    _collect(p_elem, True)
    normalized = ' '.join(''.join(pieces).split())
    return normalized or None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_introdd_position(p_elem):
    """
    Locate the first INTRODD relative to the first PPI among the direct
    children of *p_elem*.

    Returns:
        'ANTE'  if the first INTRODD child comes before the first PPI child,
        'POST'  if it comes after the first PPI child,
        'AUTRE' if an INTRODD child exists but no PPI child does,
        None    if there is no INTRODD child at all.
    """
    tags = [child.tag for child in p_elem]
    if 'INTRODD' not in tags:
        return None
    if 'PPI' not in tags:
        return 'AUTRE'
    # The two indices refer to different tags, so they can never be equal.
    return 'ANTE' if tags.index('INTRODD') < tags.index('PPI') else 'POST'
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_introdd_position_dd(paragraph_text, ignore_multi_ppi=True):
    """
    Classify where the first ``<INTRODD>`` sits relative to the ``<DD>`` spans
    of a tagged paragraph.

    Args:
        paragraph_text: paragraph text with literal ``<DD>``, ``<INTRODD>``
            and ``<PPI>`` tags. Falsy or non-string values yield None.
        ignore_multi_ppi: when True, paragraphs containing more than one
            ``<PPI>`` tag are not classified (None is returned).

    Returns:
        'INCISE' if the INTRODD lies inside a DD that contains a PPI, or if it
                 is the only content (apart from punctuation and whitespace,
                 and with no dash) between two DDs of which at least one
                 contains a PPI;
        'ANTE'   if some DD starting at or after the INTRODD contains a PPI;
        'POST'   if some DD ending at or before the INTRODD contains a PPI;
        None     otherwise (no INTRODD, no closed DD, or no DD with a PPI).
    """
    if not paragraph_text or not isinstance(paragraph_text, str):
        return None

    if ignore_multi_ppi and paragraph_text.count('<PPI>') > 1:
        return None

    intro_open = paragraph_text.find('<INTRODD>')
    if intro_open == -1:
        return None
    # Look for the closing tag after the opening one, so a stray '</INTRODD>'
    # earlier in the text cannot be mistaken for the end of this INTRODD.
    intro_close_at = paragraph_text.find('</INTRODD>', intro_open)
    if intro_close_at == -1:
        intro_close = intro_open
    else:
        intro_close = intro_close_at + len('</INTRODD>')

    # All DD spans in document order: (open_start, close_end, full_text_of_DD).
    # Each '<DD>' is paired with the first '</DD>' that follows it.
    dd_spans = []
    for m_open in re.finditer(r'<DD>', paragraph_text):
        close_at = paragraph_text.find('</DD>', m_open.end())
        if close_at != -1:
            close_end = close_at + len('</DD>')
            dd_spans.append((m_open.start(), close_end, paragraph_text[m_open.start():close_end]))

    if not dd_spans:
        return None

    def has_ppi(dd_text):
        return '<PPI>' in dd_text

    # --- INCISE case 1: INTRODD is fully inside a single DD ---
    for dd_open, dd_close_end, dd_text in dd_spans:
        if dd_open <= intro_open and intro_close <= dd_close_end and has_ppi(dd_text):
            return 'INCISE'

    # DDs ending at or before the INTRODD / starting at or after it.
    dds_before = [span for span in dd_spans if span[1] <= intro_open]
    dds_after = [span for span in dd_spans if span[0] >= intro_close]

    # --- INCISE case 2: INTRODD is sandwiched between two immediate DDs ---
    if dds_before and dds_after:
        # dd_spans is ordered by opening position, so these are by
        # construction the DDs nearest to the INTRODD on each side.
        nearest_before = dds_before[-1]
        nearest_after = dds_after[0]

        # INCISE only if there is nothing between the two DDs except the
        # INTRODD (and punctuation/whitespace); narrator text -> POST/ANTE.
        between = paragraph_text[nearest_before[1]:nearest_after[0]]
        without_introdd = re.sub(r'<INTRODD>.*?</INTRODD>', '', between, flags=re.DOTALL)
        without_tags = re.sub(r'<[^>]+>', '', without_introdd)

        # A dash left after removing the INTRODD signals a new speaker turn.
        has_turn_change = bool(re.search(r'[–—]', without_tags))
        residual = re.sub(r'[\s\W]+', '', without_tags)

        if residual == '' and not has_turn_change:
            if has_ppi(nearest_before[2]) or has_ppi(nearest_after[2]):
                return 'INCISE'

    # --- ANTE: INTRODD comes before a DD that contains PPI ---
    if any(has_ppi(dd_text) for _, _, dd_text in dds_after):
        return 'ANTE'

    # --- POST: INTRODD comes after a DD that contains PPI ---
    if any(has_ppi(dd_text) for _, _, dd_text in dds_before):
        return 'POST'

    return None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _occurrence_suffix(index):
    """Return '' for the first occurrence and '_2', '_3', ... for later ones."""
    return '' if index == 0 else f'_{index + 1}'


def _element_text(elem):
    """Return the whitespace-normalized text of *elem* and all its descendants."""
    return clean_text(''.join(elem.itertext()))


def _store_texts(row, prefix, elems):
    """Store '<prefix>[_n]_text' for each element, or a None placeholder if there is none."""
    for index, elem in enumerate(elems):
        row[f'{prefix}{_occurrence_suffix(index)}_text'] = _element_text(elem)
    if not elems:
        row[f'{prefix}_text'] = None


def _store_expansions(row, prefix, parent):
    """Store text/constr/type of each EXPANSION child of *parent*, or None placeholders."""
    expansions = parent.findall('EXPANSION')
    for index, exp in enumerate(expansions):
        key = f'{prefix}_EXPANSION{_occurrence_suffix(index)}'
        row[f'{key}_text'] = _element_text(exp)
        row[f'{key}_constr'] = exp.get('constr')
        row[f'{key}_type'] = exp.get('type')
    if not expansions:
        for field in ('text', 'constr', 'type'):
            row[f'{prefix}_EXPANSION_{field}'] = None


def extract_paragraphs(xml_path):
    """
    Parse an annotated XML file into one flat dict per ``<p>`` child of the root.

    Each row holds, in this order:
      - 'p_id': the first quoted span («...» or "...") found in the paragraph
        text, or None;
      - 'paragraph_text': the paragraph serialized with literal tags;
      - INTRODD columns (text, EXPANSION, MOD, VDD and VDD EXPANSION);
      - 'POSITION_INTRODD';
      - PPI columns (text, decl, type, MD), NONPPI, standalone MD and APP.

    Repeated elements get the suffixes '_2', '_3', ... ; when an element kind
    is absent, its un-suffixed columns are still created with None.

    Args:
        xml_path: path of the XML file to parse.

    Returns:
        A list of row dicts, one per paragraph, in document order.
    """
    root = ET.parse(xml_path).getroot()
    rows = []

    for p in root.findall('p'):
        match = re.search(r'[«"]["\s]*([^"»]+)["\s]*[»"]', ''.join(p.itertext()))
        row = {
            'p_id': match.group(1).strip() if match else None,
            'paragraph_text': serialize_paragraph(p),
        }

        # --- INTRODD (multiple), each with EXPANSION, MOD and VDD children ---
        introdds = p.findall('INTRODD')
        for i, introdd in enumerate(introdds):
            prefix = f'INTRODD{_occurrence_suffix(i)}'
            row[f'{prefix}_text'] = _element_text(introdd)
            _store_expansions(row, prefix, introdd)
            _store_texts(row, f'{prefix}_MOD', introdd.findall('MOD'))

            vdds = introdd.findall('VDD')
            for j, vdd in enumerate(vdds):
                vdd_prefix = f'{prefix}_VDD{_occurrence_suffix(j)}'
                row[f'{vdd_prefix}_text'] = _element_text(vdd)
                row[f'{vdd_prefix}_type'] = vdd.get('type')
                _store_expansions(row, vdd_prefix, vdd)
            if not vdds:
                for field in ('VDD_text', 'VDD_type', 'VDD_EXPANSION_text',
                              'VDD_EXPANSION_constr', 'VDD_EXPANSION_type'):
                    row[f'{prefix}_{field}'] = None

        if not introdds:
            for key in ('INTRODD_text',
                        'INTRODD_EXPANSION_text', 'INTRODD_EXPANSION_constr', 'INTRODD_EXPANSION_type',
                        'INTRODD_MOD_text',
                        'INTRODD_VDD_text', 'INTRODD_VDD_type',
                        'INTRODD_VDD_EXPANSION_text', 'INTRODD_VDD_EXPANSION_constr',
                        'INTRODD_VDD_EXPANSION_type'):
                row[key] = None

        # --- POSITION_INTRODD ---
        row['POSITION_INTRODD'] = get_introdd_position(p)

        # --- All PPIs (multiple MD in each PPI) ---
        ppis = p.findall('PPI')
        for i, ppi in enumerate(ppis):
            prefix = f'PPI{_occurrence_suffix(i)}'
            row[f'{prefix}_text'] = _element_text(ppi)
            row[f'{prefix}_decl'] = ppi.get('decl')
            row[f'{prefix}_type'] = ppi.get('type')
            _store_texts(row, f'{prefix}_MD', ppi.findall('MD'))
        if not ppis:
            for key in ('PPI_text', 'PPI_decl', 'PPI_type', 'PPI_MD_text'):
                row[key] = None

        # --- NONPPI, standalone MD (direct children of <p>) and APP ---
        _store_texts(row, 'NONPPI', p.findall('NONPPI'))
        _store_texts(row, 'MD', [child for child in p if child.tag == 'MD'])
        _store_texts(row, 'APP', p.findall('APP'))

        rows.append(row)

    return rows
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _set_position_dd(df, ignore_multi_ppi):
    """
    Fill 'POSITION_INTRODD_DD' from 'paragraph_text_dd', in place.

    Does nothing when *df* has no 'paragraph_text_dd' column. An existing
    'POSITION_INTRODD_DD' column is overwritten; otherwise the column is
    inserted right after 'paragraph_text_dd'. Missing cells are left as-is.
    """
    if 'paragraph_text_dd' not in df.columns:
        return
    positions = df['paragraph_text_dd'].map(
        lambda t: get_introdd_position_dd(t, ignore_multi_ppi), na_action='ignore')
    if 'POSITION_INTRODD_DD' in df.columns:
        df['POSITION_INTRODD_DD'] = positions
    else:
        df.insert(df.columns.get_loc('paragraph_text_dd') + 1, 'POSITION_INTRODD_DD', positions)


def _finalize_frame(df, ignore_multi_ppi):
    """
    Apply the shared post-processing to an extracted DataFrame and return it.

    Empty strings become NaN, all-NaN columns are dropped, every '*_text'
    column not listed in NO_LOWER is lowercased, and 'POSITION_INTRODD_DD' is
    added when 'paragraph_text_dd' is present.
    """
    df = df.replace('', np.nan)
    df.dropna(axis=1, how='all', inplace=True)
    text_cols = [c for c in df.columns if c.endswith('_text') and c not in NO_LOWER]
    if text_cols:
        df[text_cols] = df[text_cols].apply(
            lambda col: col.map(lambda v: v.lower() if isinstance(v, str) else v)
        )
    _set_position_dd(df, ignore_multi_ppi)
    return df


def main():
    """
    Command-line entry point.

    Usage: xml2xlsx <xml_file_or_folder> [output_file] [ignore_multi_ppi=true|false]

    - <file>.xml: extract its paragraphs and write them to an .xlsx file
      (default: same path with the '.xml' suffix replaced by '.xlsx').
    - <folder>: convert every '*.xml' file inside to its own .xlsx and also
      write a combined workbook (default: '<folder>/master_output.xlsx').
    - <file>.xlsx: recompute 'POSITION_INTRODD_DD' from 'paragraph_text_dd'
      and rewrite the file in place; [output_file] is not used in this mode.

    [output_file] overrides the default output path of the XML modes; pass ''
    or '-' to keep the default while still supplying the third argument.
    """
    if len(sys.argv) < 2:
        print("Usage: xml2xlsx <xml_file_or_folder> [output_file] [ignore_multi_ppi=true|false]")
        sys.exit(1)

    input_path = sys.argv[1]

    # argv[2]: optional output path, as advertised in the usage line.
    requested_output = None
    if len(sys.argv) >= 3 and sys.argv[2] not in ('', '-'):
        requested_output = sys.argv[2]

    # argv[3]: ignore_multi_ppi flag (default True; pass 'false' to disable)
    ignore_multi_ppi = True
    if len(sys.argv) >= 4:
        flag = sys.argv[3].strip().lower()
        if flag == 'false':
            ignore_multi_ppi = False
        elif flag != 'true':
            print(f"Warning: unrecognized value '{sys.argv[3]}' for ignore_multi_ppi — defaulting to True")
    print(f"[config] ignore_multi_ppi = {ignore_multi_ppi}")

    all_rows = []
    xml_ext = '.xml'

    if os.path.isfile(input_path) and input_path.endswith(xml_ext):
        all_rows = extract_paragraphs(input_path)
        for row in all_rows:
            row['source_file'] = os.path.basename(input_path)
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.xml' occurring earlier in the path.
        output_file = requested_output or input_path[:-len(xml_ext)] + '.xlsx'

    elif os.path.isdir(input_path):
        output_file = requested_output or os.path.join(input_path, "master_output.xlsx")
        for f in sorted(os.listdir(input_path)):
            if not f.endswith(xml_ext):
                continue
            file_path = os.path.join(input_path, f)
            rows = extract_paragraphs(file_path)
            if rows:
                # The per-file workbook is built before 'source_file' is
                # added, so that column only appears in the master file.
                individual_df = _finalize_frame(pd.DataFrame(rows), ignore_multi_ppi)
                individual_output = file_path[:-len(xml_ext)] + '.xlsx'
                format_ppi_bold(individual_df, individual_output)
                print(f"Saved individual file: {individual_output}")
            for row in rows:
                row['source_file'] = f
            all_rows.extend(rows)

    elif os.path.isfile(input_path) and input_path.endswith('.xlsx'):
        df = pd.read_excel(input_path)
        if 'paragraph_text_dd' not in df.columns:
            print("Error: column 'paragraph_text_dd' not found in the xlsx file.")
            sys.exit(1)
        _set_position_dd(df, ignore_multi_ppi)
        format_ppi_bold(df, input_path)
        print(f"Updated {input_path}")
        sys.exit(0)

    else:
        print("Please provide a valid .xml file, .xlsx file, or folder containing .xml files")
        sys.exit(1)

    if all_rows:
        df = pd.DataFrame(all_rows)
        # 'source_file' leads the master sheet.
        df = df[['source_file'] + [c for c in df.columns if c != 'source_file']]
        df = _finalize_frame(df, ignore_multi_ppi)
        format_ppi_bold(df, output_file)
        print(f"Saved master file to {output_file}")
    else:
        print("No data extracted from input")


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xml2xlsx-lidilem
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert annotated XML to Excel
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: openpyxl
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: numpy
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
xml2xlsx/__init__.py
|
|
3
|
+
xml2xlsx/format_excel.py
|
|
4
|
+
xml2xlsx/xml2xlsx.py
|
|
5
|
+
xml2xlsx_lidilem.egg-info/PKG-INFO
|
|
6
|
+
xml2xlsx_lidilem.egg-info/SOURCES.txt
|
|
7
|
+
xml2xlsx_lidilem.egg-info/dependency_links.txt
|
|
8
|
+
xml2xlsx_lidilem.egg-info/entry_points.txt
|
|
9
|
+
xml2xlsx_lidilem.egg-info/requires.txt
|
|
10
|
+
xml2xlsx_lidilem.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
xml2xlsx
|