xml2xlsx-lidilem 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: xml2xlsx-lidilem
3
+ Version: 0.1.0
4
+ Summary: Convert annotated XML to Excel
5
+ License: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: openpyxl
9
+ Requires-Dist: pandas
10
+ Requires-Dist: numpy
@@ -0,0 +1,19 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "xml2xlsx-lidilem"
7
+ version = "0.1.0"
8
+ description = "Convert annotated XML to Excel"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
dependencies = [
    "openpyxl",
    "pandas",
    "numpy",
    # pandas only ships the ExcelWriter front end; format_excel.py opens it
    # with engine="xlsxwriter", so that engine has to be installed as well.
    "xlsxwriter",
]
17
+
18
+ [project.scripts]
19
+ xml2xlsx = "xml2xlsx.xml2xlsx:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,287 @@
1
+ import re
2
+ import pandas as pd
3
+ import math
4
+ from openpyxl.utils import get_column_letter
5
+ from openpyxl.styles import Alignment, PatternFill
6
+ from openpyxl.utils import get_column_letter
7
+ import sys
8
+
9
# Font colour (hex RGB) used for the text enclosed by each annotation tag when
# a cell is written as an xlsxwriter rich string.
TAG_COLORS = {
    "INTRODD": "#1F4E79",    # dark blue
    "VDD": "#7030A0",        # purple
    "EXPANSION": "#C55A11",  # dark orange
    "MOD": "#833C00",        # brown
    "PPI": "#C00000",        # dark red
    "NONPPI": "#375623",     # dark green
    "MD": "#2E75B6",         # medium blue
    "APP": "#595959",        # dark grey
    "DD": "#1D6B5E",         # dark teal
}

# Fallback colour (black) for tags that have no entry in TAG_COLORS.
TAG_COLOR_DEFAULT = "#000000"
22
+
23
+
24
def _parse_tagged_text(text):
    """
    Split a string on attribute-less XML-like tags (``<NAME>`` / ``</NAME>``).

    Returns a list of dicts, in document order, each shaped like
    ``{'type': 'text'|'open_tag'|'close_tag', 'value': str, 'tag': str or None}``.
    ``value`` is the literal slice of the input (the full ``<NAME>`` markup for
    tags) and ``tag`` is the bare tag name, or None for text. Empty text runs
    between adjacent tags are not emitted.
    """
    # With two capture groups re.split yields
    # [text, markup, name, text, markup, name, ..., text].
    pieces = re.split(r'(</?(\w+)>)', text)

    segments = []
    leading = pieces[0]
    if leading:
        segments.append({'type': 'text', 'value': leading, 'tag': None})

    for idx in range(1, len(pieces), 3):
        markup, name, trailing = pieces[idx:idx + 3]
        kind = 'close_tag' if markup.startswith('</') else 'open_tag'
        segments.append({'type': kind, 'value': markup, 'tag': name})
        if trailing:
            segments.append({'type': 'text', 'value': trailing, 'tag': None})

    return segments
48
+
49
+
50
def _build_rich_string_args(text, workbook):
    """
    Build the argument list for ``worksheet.write_rich_string()`` from a tagged
    paragraph string.

    The literal tag markup is rendered grey italic, text inside a tag is bold
    and coloured after the innermost open tag (TAG_COLORS, falling back to
    TAG_COLOR_DEFAULT), and text outside any tag is black.

    Args:
        text: Paragraph text containing ``<TAG>``/``</TAG>`` markers.
        workbook: xlsxwriter workbook used to create the formats.

    Returns:
        A flat ``[format, string, format, string, ...]`` list, or None when the
        cell should be written with a plain ``write()`` instead: either the
        text has no tags, or it yields a single fragment only.
        write_rich_string() refuses input with fewer than three tokens and
        would leave the cell empty.
    """
    segments = _parse_tagged_text(text)
    if all(seg['type'] == 'text' for seg in segments):
        return None

    plain_fmt = workbook.add_format({'font_color': '#000000', 'text_wrap': True, 'valign': 'top'})
    tag_label_fmt = workbook.add_format({'font_color': '#AAAAAA', 'italic': True, 'text_wrap': True, 'valign': 'top'})
    content_fmts = {}

    def content_fmt(tag_name):
        # One format per tag name, created on first use.
        if tag_name not in content_fmts:
            content_fmts[tag_name] = workbook.add_format({
                'font_color': TAG_COLORS.get(tag_name, TAG_COLOR_DEFAULT),
                'text_wrap': True,
                'valign': 'top',
                'bold': True,
            })
        return content_fmts[tag_name]

    args = []
    open_tags = []  # stack of currently open tag names; the last one colours the text

    for seg in segments:
        kind, value = seg['type'], seg['value']
        if kind == 'text':
            if not value:
                continue
            fmt = content_fmt(open_tags[-1]) if open_tags else plain_fmt
        else:
            fmt = tag_label_fmt
            if kind == 'open_tag':
                open_tags.append(seg['tag'])
            elif open_tags and open_tags[-1] == seg['tag']:
                # A closing tag that does not match the innermost open tag is
                # still rendered, but leaves the stack untouched.
                open_tags.pop()
        # Every string is emitted together with its format and is never empty,
        # so the list needs no further filtering.
        args.extend((fmt, value))

    if len(args) <= 2:
        return None
    return args
120
+
121
+
122
def _strip_markup(text):
    """Return *text* without ``<...>`` tags and ``**`` markers (used for size estimates)."""
    return re.sub(r'\*\*', '', re.sub(r'<[^>]+>', '', text))


def _write_rich_or_plain(worksheet, row, col, fragments, text, cell_format):
    """Write *fragments* as a rich string, falling back to plain *text* if that is not possible."""
    # write_rich_string() needs more than two tokens and reports any rejected
    # input through a non-zero status; without the fallback the cell stays empty.
    if len(fragments) > 2 and worksheet.write_rich_string(row, col, *fragments) == 0:
        return
    worksheet.write(row, col, text, cell_format)


def format_ppi_bold(df, filename):
    """
    Write *df* to *filename* as a formatted single-sheet workbook (xlsxwriter).

    - In every object column a line break is inserted before each en dash.
    - Columns whose name starts with ``Unnamed`` are dropped.
    - Column widths (capped at 100) and row heights are estimated from the
      text length with tags and ``**`` markers removed.
    - ``paragraph_text`` / ``paragraph_text_dd`` cells are written as
      colour-coded rich strings via ``_build_rich_string_args``.
    - In all other columns ``**x**`` and ``<strong>x</strong>`` are rewritten
      to ``<PPI>x</PPI>``; the tags stay visible and the enclosed text is bold.

    The caller's DataFrame is left untouched.

    Args:
        df: DataFrame to export.
        filename: Path of the ``.xlsx`` file to create or overwrite.
    """
    # Work on a copy: the dash/newline rewrite below must not leak to the caller.
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(
                lambda v: v.replace('–', '\n–').replace('\n\n', '\n') if isinstance(v, str) else v
            )

    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.reset_index(drop=True)
    columns = list(df.columns)

    # Stringify every cell once (None marks a missing value) instead of
    # rebuilding the row Series per cell in each of the three passes below.
    cell_texts = []
    for row_num in range(len(df)):
        row = df.iloc[row_num]
        row_texts = []
        for col_name in columns:
            value = row[col_name]
            row_texts.append(str(value) if pd.notna(value) else None)
        cell_texts.append(row_texts)

    # Column widths: longest visible text plus padding, never narrower than the header.
    col_widths = {col: len(str(col)) for col in columns}
    for row_texts in cell_texts:
        for col_name, text in zip(columns, row_texts):
            if text is None or not text.strip():
                continue
            col_widths[col_name] = max(col_widths[col_name], len(_strip_markup(text)) + 2)
    for col_name in col_widths:
        col_widths[col_name] = min(col_widths[col_name], 100)

    # Row heights: estimated number of wrapped lines in the tallest cell of the row.
    base_height = 15
    row_heights = {}
    for row_num, row_texts in enumerate(cell_texts):
        max_height = base_height
        for col_name, text in zip(columns, row_texts):
            if text is None:
                continue
            chars_per_line = int(col_widths[col_name] * 1.1)
            if chars_per_line > 0:
                lines_needed = max(1, math.ceil(len(_strip_markup(text)) / chars_per_line))
            else:
                lines_needed = 1
            max_height = max(max_height, base_height * lines_needed)
        row_heights[row_num + 1] = max_height  # +1: sheet row 0 is the header

    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        workbook = writer.book
        worksheet = workbook.add_worksheet('Sheet1')
        writer.sheets['Sheet1'] = worksheet

        bold = workbook.add_format({'bold': True})
        regular = workbook.add_format({'text_wrap': True, 'valign': 'top'})
        header_format = workbook.add_format({'bold': True, 'valign': 'top', 'bg_color': '#F2F2F2'})

        for col_num, col_name in enumerate(columns):
            worksheet.write(0, col_num, col_name, header_format)
            worksheet.set_column(col_num, col_num, col_widths[col_name], regular)

        for row_num, row_texts in enumerate(cell_texts):
            excel_row = row_num + 1
            worksheet.set_row(excel_row, row_heights[excel_row])

            for col_num, (col_name, text) in enumerate(zip(columns, row_texts)):
                if text is None:
                    continue

                if col_name in ['paragraph_text', 'paragraph_text_dd']:
                    # Rich string with tag colour coding.
                    rich_args = _build_rich_string_args(text, workbook)
                    _write_rich_or_plain(worksheet, excel_row, col_num, rich_args or [], text, regular)
                    continue

                # Other columns: normalise bold markers to <PPI> tags, then
                # render the enclosed text bold while keeping the tags visible.
                text = re.sub(r'\*\*(.*?)\*\*', r'<PPI>\1</PPI>', text)
                text = re.sub(r'<strong>(.*?)</strong>', r'<PPI>\1</PPI>', text)
                parts = re.split(r'(<PPI>|</PPI>)', text)

                if len(parts) <= 1:
                    worksheet.write(excel_row, col_num, text, regular)
                    continue

                fragments = []
                is_bold = False
                for part in parts:
                    if part == '<PPI>':
                        is_bold = True
                        fragments.append(part)
                    elif part == '</PPI>':
                        is_bold = False
                        fragments.append(part)
                    elif part:
                        if is_bold:
                            fragments.append(bold)
                        fragments.append(part)
                _write_rich_or_plain(worksheet, excel_row, col_num, fragments, text, regular)
230
+
231
+
232
def color_compare_pairs(df, filename):
    """
    Write *df* to *filename* (openpyxl) and colour each ``X_human`` / ``X_ia``
    column pair per row: green when both cells hold the same text ignoring
    case and surrounding whitespace (missing values count as empty), red
    otherwise. Column widths are then fitted to the content, capped at 50.

    If no ``*_human`` column has a matching ``*_ia`` column, a message is
    printed and no file is written.

    Args:
        df: DataFrame to export; it is copied, not modified.
        filename: Path of the ``.xlsx`` file to create or overwrite.
    """
    df_copy = df.copy()
    human_suffix = '_human'

    pairs = []
    for human_col in df_copy.columns:
        if not (isinstance(human_col, str) and human_col.endswith(human_suffix)):
            continue
        # Swap the suffix only; str.replace would also rewrite a '_human'
        # occurring in the middle of the column name.
        ia_col = human_col[:-len(human_suffix)] + '_ia'
        if ia_col in df_copy.columns:
            pairs.append((human_col, ia_col))

    if not pairs:
        print("No '_human' and '_ia' column pairs found")
        return

    def as_key(value):
        # Normalised text used for the equality test.
        return (str(value) if pd.notna(value) else "").strip().lower()

    with pd.ExcelWriter(filename, engine='openpyxl', mode='w') as writer:
        df_copy.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        green_fill = PatternFill(start_color='C6EFCE', end_color='C6EFCE', fill_type='solid')
        red_fill = PatternFill(start_color='FFC7CE', end_color='FFC7CE', fill_type='solid')

        # 1-based sheet column of each paired column, resolved once.
        sheet_cols = [
            (h, i, df_copy.columns.get_loc(h) + 1, df_copy.columns.get_loc(i) + 1)
            for h, i in pairs
        ]

        for row_num in range(len(df_copy)):
            excel_row = row_num + 2  # row 1 holds the header
            row = df_copy.iloc[row_num]
            for human_col, ia_col, human_idx, ia_idx in sheet_cols:
                same = as_key(row[human_col]) == as_key(row[ia_col])
                fill = green_fill if same else red_fill
                worksheet.cell(row=excel_row, column=human_idx).fill = fill
                worksheet.cell(row=excel_row, column=ia_idx).fill = fill

        for column in worksheet.columns:
            # Empty cells count as zero-length rather than as the text "None".
            max_length = max(
                (len(str(cell.value)) for cell in column if cell.value is not None),
                default=0,
            )
            column_letter = get_column_letter(column[0].column)
            worksheet.column_dimensions[column_letter].width = min(max_length + 2, 50)
277
+
278
+
279
def format_and_compare(df, filename):
    """
    Export *df* with ``format_ppi_bold`` and then run ``color_compare_pairs``
    on the file that was just written.

    The second step reloads the workbook with ``pd.read_excel`` and, when it
    finds ``_human``/``_ia`` pairs, writes *filename* again in ``mode='w'``
    from the reloaded values.
    """
    format_ppi_bold(df, filename)
    reloaded = pd.read_excel(filename)
    color_compare_pairs(reloaded, filename)
283
+
284
+
285
# Script mode: format the workbook named on the command line and save the
# result next to it as "<name>_formatted.xlsx".
if __name__ == "__main__":
    import os

    if len(sys.argv) < 2:
        print("Usage: python -m xml2xlsx.format_excel <input.xlsx>")
        sys.exit(1)

    source_path = sys.argv[1]
    # Derive the output name from the file stem only: a plain str.replace would
    # also rewrite ".xlsx" inside directory names and, for inputs without that
    # extension, return the input path unchanged so the source is overwritten.
    stem, _ext = os.path.splitext(source_path)
    format_ppi_bold(pd.read_excel(source_path), f"{stem}_formatted.xlsx")
@@ -0,0 +1,386 @@
1
+ import pandas as pd
2
+ import xml.etree.ElementTree as ET
3
+ import sys
4
+ import os
5
+ import re
6
+ from xml2xlsx.format_excel import format_ppi_bold
7
+ import numpy as np
8
+
9
+
10
def clean_text(text):
    """
    Collapse every run of whitespace in *text* to a single space and trim both ends.

    Returns None when *text* is None, empty or whitespace-only.
    """
    if text:
        collapsed = ' '.join(text.split())
        if collapsed:
            return collapsed
    return None
15
+
16
+
17
def clean_lower(text):
    """
    Lower-case *text* after collapsing whitespace runs to single spaces.

    Returns None for None or an empty string; a whitespace-only string yields ''.
    """
    if not text:
        return None
    words = text.split()
    return ' '.join(words).lower()
19
+
20
+
21
# '*_text' columns that main() leaves in their original casing; every other
# '*_text' column is lower-cased before export.
NO_LOWER = {
    'INTRODD_text',
    'paragraph_text',
    'APP_text',
    'INTRODD_EXPANSION_text',
    'INTRODD_EXPANSION_2_text',
    'INTRODD_EXPANSION_3_text',
}

# Tags to preserve in paragraph_text (order matters for nested: deepest first)
# NOTE(review): not referenced anywhere in the visible part of this module.
PRESERVE_TAGS = [
    'INTRODD',
    'VDD',
    'EXPANSION',
    'MOD',
    'PPI',
    'NONPPI',
    'MD',
    'APP',
]
25
+
26
+
27
def serialize_paragraph(p_elem):
    """
    Rebuild the text of a paragraph element, keeping the tags of its
    descendants as literal ``<TAG>`` / ``</TAG>`` markers.

    Attributes are not reproduced, elements named ``p`` contribute their
    content without markers, and whitespace runs are collapsed to single
    spaces. Text that follows the paragraph's own closing tag (its ``tail``)
    belongs to the parent element and is not included.

    Args:
        p_elem: ``xml.etree.ElementTree.Element`` for the paragraph.

    Returns:
        The serialized paragraph, or None if it contains no visible text.
    """
    parts = []

    def _collect(elem):
        show_tag = elem.tag != 'p'
        if show_tag:
            parts.append(f'<{elem.tag}>')
        if elem.text:
            parts.append(elem.text)
        for child in elem:
            _collect(child)
            # A child's tail is text inside *elem*, so it is emitted here,
            # which keeps the tail of the top-level paragraph out.
            if child.tail:
                parts.append(child.tail)
        if show_tag:
            parts.append(f'</{elem.tag}>')

    _collect(p_elem)
    normalized = ' '.join(''.join(parts).split())
    return normalized or None
46
+
47
+
48
def get_introdd_position(p_elem):
    """
    Classify where the first INTRODD child sits relative to the first PPI child.

    Only direct children of *p_elem* are considered.

    Returns:
        'ANTE' if the first INTRODD precedes the first PPI,
        'POST' if it follows the first PPI,
        'AUTRE' if an INTRODD is present but there is no PPI,
        None if there is no INTRODD at all.
    """
    child_tags = [child.tag for child in p_elem]

    if 'INTRODD' not in child_tags:
        return None
    if 'PPI' not in child_tags:
        return 'AUTRE'

    # The two tags differ, so their first positions can never coincide and no
    # third outcome exists.
    if child_tags.index('INTRODD') < child_tags.index('PPI'):
        return 'ANTE'
    return 'POST'
73
+
74
+
75
def get_introdd_position_dd(paragraph_text, ignore_multi_ppi=True):
    """
    Classify the first ``<INTRODD>`` of a tagged paragraph relative to its
    ``<DD>`` spans.

    A DD span runs from a ``<DD>`` to the first ``</DD>`` that follows it.

    Args:
        paragraph_text: Paragraph text with literal tag markers.
        ignore_multi_ppi: When True, paragraphs with more than one ``<PPI>``
            are not classified.

    Returns:
        'INCISE' if the INTRODD lies inside a DD that contains a PPI, or sits
            between two DDs (at least one of them containing a PPI) with
            nothing but the INTRODD, punctuation and whitespace in between
            and no en/em dash outside the INTRODD;
        'ANTE' if some DD starting after the INTRODD contains a PPI;
        'POST' if some DD ending before the INTRODD contains a PPI;
        None otherwise (empty text, no INTRODD, no DD, skipped multi-PPI
            paragraph, or no DD with a PPI on either side).
    """
    if not paragraph_text:
        return None
    if ignore_multi_ppi and paragraph_text.count('<PPI>') > 1:
        return None

    intro_open = paragraph_text.find('<INTRODD>')
    if intro_open == -1:
        return None
    close_at = paragraph_text.find('</INTRODD>')
    # Without a closing tag the INTRODD is treated as a zero-width point.
    intro_close = close_at + len('</INTRODD>') if close_at != -1 else intro_open

    # DD spans in document order: (start of <DD>, end of </DD>, contains a PPI?)
    dd_spans = []
    for m_open in re.finditer(r'<DD>', paragraph_text):
        dd_close = paragraph_text.find('</DD>', m_open.end())
        if dd_close == -1:
            continue
        dd_end = dd_close + len('</DD>')
        dd_spans.append((m_open.start(), dd_end, '<PPI>' in paragraph_text[m_open.start():dd_end]))

    if not dd_spans:
        return None

    # --- INCISE case 1: INTRODD is fully inside a single DD that has a PPI ---
    for dd_start, dd_end, dd_has_ppi in dd_spans:
        if dd_has_ppi and dd_start <= intro_open and intro_close <= dd_end:
            return 'INCISE'

    dds_before = [span for span in dd_spans if span[1] <= intro_open]
    dds_after = [span for span in dd_spans if span[0] >= intro_close]

    # --- INCISE case 2: INTRODD is sandwiched between two DDs ---
    if dds_before and dds_after:
        # Spans are ordered by start offset, so the last one before and the
        # first one after are the immediate neighbours; no further check for
        # an intervening DD is needed.
        prev_dd, next_dd = dds_before[-1], dds_after[0]
        between = paragraph_text[prev_dd[1]:next_dd[0]]
        narrator = re.sub(r'<INTRODD>.*?</INTRODD>', '', between, flags=re.DOTALL)
        narrator = re.sub(r'<[^>]+>', '', narrator)

        # A dash left over outside the INTRODD signals a new speaker turn.
        has_turn_change = re.search(r'[–—]', narrator) is not None
        # Any remaining word character is narrator text outside the INTRODD.
        residual = re.sub(r'[\s\W]+', '', narrator)

        if residual == '' and not has_turn_change and (prev_dd[2] or next_dd[2]):
            return 'INCISE'

    # --- ANTE: INTRODD comes before a DD that contains a PPI ---
    if any(has_ppi for _, _, has_ppi in dds_after):
        return 'ANTE'

    # --- POST: INTRODD comes after a DD that contains a PPI ---
    if any(has_ppi for _, _, has_ppi in dds_before):
        return 'POST'

    return None
155
+
156
+
157
def _numbered(index):
    """Key suffix for the index-th occurrence: '' for the first, then '_2', '_3', ..."""
    return '' if index == 0 else f'_{index + 1}'


def _element_text(elem):
    """Whitespace-normalised text of *elem* and all of its descendants."""
    return clean_text(''.join(elem.itertext()))


def _blank_expansion(row, prefix):
    """Add the three placeholder EXPANSION fields under *prefix*."""
    row[f'{prefix}_EXPANSION_text'] = None
    row[f'{prefix}_EXPANSION_constr'] = None
    row[f'{prefix}_EXPANSION_type'] = None


def _store_expansions(row, prefix, parent):
    """Record every direct EXPANSION child of *parent* (text, constr, type) under *prefix*."""
    expansions = parent.findall('EXPANSION')
    if not expansions:
        _blank_expansion(row, prefix)
        return
    for n, exp in enumerate(expansions):
        key = f'{prefix}_EXPANSION{_numbered(n)}'
        row[f'{key}_text'] = _element_text(exp)
        row[f'{key}_constr'] = exp.get('constr')
        row[f'{key}_type'] = exp.get('type')


def _store_texts(row, prefix, elems):
    """Record the text of each element as '<prefix>[_n]_text', or one None placeholder."""
    if not elems:
        row[f'{prefix}_text'] = None
        return
    for n, elem in enumerate(elems):
        row[f'{prefix}{_numbered(n)}_text'] = _element_text(elem)


def _store_introdd(row, prefix, introdd):
    """Record one INTRODD element: its text, EXPANSIONs, MODs and VDDs (with their EXPANSIONs)."""
    row[f'{prefix}_text'] = _element_text(introdd)
    _store_expansions(row, prefix, introdd)
    _store_texts(row, f'{prefix}_MOD', introdd.findall('MOD'))

    vdds = introdd.findall('VDD')
    for n, vdd in enumerate(vdds):
        vdd_prefix = f'{prefix}_VDD{_numbered(n)}'
        row[f'{vdd_prefix}_text'] = _element_text(vdd)
        row[f'{vdd_prefix}_type'] = vdd.get('type')
        _store_expansions(row, vdd_prefix, vdd)
    if not vdds:
        row[f'{prefix}_VDD_text'] = None
        row[f'{prefix}_VDD_type'] = None
        _blank_expansion(row, f'{prefix}_VDD')


def extract_paragraphs(xml_path):
    """
    Parse *xml_path* and return one dict per ``<p>`` child of the root element.

    Each row holds ``p_id`` (the first quoted span found in the paragraph
    text), ``paragraph_text`` (see ``serialize_paragraph``), the flattened
    INTRODD / PPI / NONPPI / MD / APP annotations of the paragraph's direct
    children, and ``POSITION_INTRODD``. Repeated elements get ``_2``, ``_3``,
    ... inserted after the tag name; absent elements are recorded as None
    under the un-numbered key. Keys are inserted in a fixed order, which
    determines the column order of a DataFrame built from the rows.
    """
    root = ET.parse(xml_path).getroot()
    rows = []

    for p in root.findall('p'):
        quoted = re.search(r'[«"]["\s]*([^"»]+)["\s]*[»"]', ''.join(p.itertext()))
        row = {
            'p_id': quoted.group(1).strip() if quoted else None,
            'paragraph_text': serialize_paragraph(p),
        }

        # --- INTRODD (multiple), each with EXPANSION / MOD / VDD children ---
        introdds = p.findall('INTRODD')
        for i, introdd in enumerate(introdds):
            _store_introdd(row, f'INTRODD{_numbered(i)}', introdd)
        if not introdds:
            row['INTRODD_text'] = None
            _blank_expansion(row, 'INTRODD')
            row['INTRODD_MOD_text'] = None
            row['INTRODD_VDD_text'] = None
            row['INTRODD_VDD_type'] = None
            _blank_expansion(row, 'INTRODD_VDD')

        # --- POSITION_INTRODD ---
        row['POSITION_INTRODD'] = get_introdd_position(p)

        # --- All PPIs, each with its MD children ---
        ppis = p.findall('PPI')
        for i, ppi in enumerate(ppis):
            ppi_prefix = f'PPI{_numbered(i)}'
            row[f'{ppi_prefix}_text'] = _element_text(ppi)
            row[f'{ppi_prefix}_decl'] = ppi.get('decl')
            row[f'{ppi_prefix}_type'] = ppi.get('type')
            _store_texts(row, f'{ppi_prefix}_MD', ppi.findall('MD'))
        if not ppis:
            row['PPI_text'] = None
            row['PPI_decl'] = None
            row['PPI_type'] = None
            row['PPI_MD_text'] = None

        # --- NONPPI, standalone MD (direct children only) and APP ---
        _store_texts(row, 'NONPPI', p.findall('NONPPI'))
        _store_texts(row, 'MD', [child for child in p if child.tag == 'MD'])
        _store_texts(row, 'APP', p.findall('APP'))

        rows.append(row)

    return rows
287
+
288
+
289
def _lowercase_text_columns(df):
    """Lower-case, in place, the string values of every '*_text' column not listed in NO_LOWER."""
    text_cols = [c for c in df.columns if c.endswith('_text') and c not in NO_LOWER]
    if text_cols:
        df[text_cols] = df[text_cols].apply(
            lambda col: col.map(lambda v: v.lower() if isinstance(v, str) else v)
        )


def _set_position_introdd_dd(df, ignore_multi_ppi):
    """Fill POSITION_INTRODD_DD from 'paragraph_text_dd', in place; no-op without that column."""
    if 'paragraph_text_dd' not in df.columns:
        return
    positions = df['paragraph_text_dd'].map(
        lambda t: get_introdd_position_dd(t, ignore_multi_ppi),
        na_action='ignore')
    if 'POSITION_INTRODD_DD' in df.columns:
        df['POSITION_INTRODD_DD'] = positions
    else:
        # New column goes directly to the right of its source column.
        df.insert(df.columns.get_loc('paragraph_text_dd') + 1, 'POSITION_INTRODD_DD', positions)


def _xlsx_path_for(xml_path):
    """Return *xml_path* with its extension swapped for '.xlsx' (directories untouched)."""
    return os.path.splitext(xml_path)[0] + '.xlsx'


def main():
    """
    Command-line entry point.

    Usage: ``xml2xlsx <xml_file_or_folder> [output_file] [ignore_multi_ppi=true|false]``

    - ``.xml`` file: extract its paragraphs and write one workbook.
    - folder: write one workbook per ``.xml`` file next to it, plus a master
      workbook with all rows and a leading ``source_file`` column.
    - ``.xlsx`` file: (re)compute ``POSITION_INTRODD_DD`` from the
      ``paragraph_text_dd`` column and rewrite the workbook.

    ``output_file`` overrides the default target (``<input>.xlsx``,
    ``<folder>/master_output.xlsx`` or the input workbook itself).
    Exits with status 1 on invalid input.
    """
    if len(sys.argv) < 2:
        print("Usage: xml2xlsx <xml_file_or_folder> [output_file] [ignore_multi_ppi=true|false]")
        sys.exit(1)

    input_path = sys.argv[1]

    # argv[2]: optional output file, as advertised in the usage line.
    requested_output = None
    if len(sys.argv) >= 3 and sys.argv[2].strip():
        requested_output = sys.argv[2]

    # argv[3]: ignore_multi_ppi flag (default True; pass 'false' to disable)
    ignore_multi_ppi = True
    if len(sys.argv) >= 4:
        flag = sys.argv[3].strip().lower()
        if flag == 'false':
            ignore_multi_ppi = False
        elif flag != 'true':
            print(f"Warning: unrecognized value '{sys.argv[3]}' for ignore_multi_ppi — defaulting to True")
    print(f"[config] ignore_multi_ppi = {ignore_multi_ppi}")

    all_rows = []

    if os.path.isfile(input_path) and input_path.endswith('.xml'):
        all_rows = extract_paragraphs(input_path)
        for row in all_rows:
            row['source_file'] = os.path.basename(input_path)
        output_file = requested_output or _xlsx_path_for(input_path)

    elif os.path.isdir(input_path):
        output_file = requested_output or os.path.join(input_path, "master_output.xlsx")
        for name in sorted(os.listdir(input_path)):
            if not name.endswith('.xml'):
                continue
            file_path = os.path.join(input_path, name)
            rows = extract_paragraphs(file_path)
            if not rows:
                continue

            # Per-file workbook: drop columns that are empty for this file only.
            individual_df = pd.DataFrame(rows).replace('', np.nan)
            individual_df.dropna(axis=1, how='all', inplace=True)
            _lowercase_text_columns(individual_df)
            _set_position_introdd_dd(individual_df, ignore_multi_ppi)
            individual_output = _xlsx_path_for(file_path)
            format_ppi_bold(individual_df, individual_output)
            print(f"Saved individual file: {individual_output}")

            # source_file is only part of the master workbook.
            for row in rows:
                row['source_file'] = name
            all_rows.extend(rows)

    elif os.path.isfile(input_path) and input_path.endswith('.xlsx'):
        df = pd.read_excel(input_path)
        if 'paragraph_text_dd' not in df.columns:
            print("Error: column 'paragraph_text_dd' not found in the xlsx file.")
            sys.exit(1)
        _set_position_introdd_dd(df, ignore_multi_ppi)
        target = requested_output or input_path
        format_ppi_bold(df, target)
        print(f"Updated {target}")
        sys.exit(0)

    else:
        print("Please provide a valid .xml file, .xlsx file, or folder containing .xml files")
        sys.exit(1)

    if not all_rows:
        print("No data extracted from input")
        return

    df = pd.DataFrame(all_rows)
    df = df[['source_file'] + [c for c in df.columns if c != 'source_file']]
    _lowercase_text_columns(df)
    df = df.replace('', np.nan)
    df.dropna(axis=1, how='all', inplace=True)
    _set_position_introdd_dd(df, ignore_multi_ppi)
    format_ppi_bold(df, output_file)
    print(f"Saved master file to {output_file}")
383
+
384
+
385
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: xml2xlsx-lidilem
3
+ Version: 0.1.0
4
+ Summary: Convert annotated XML to Excel
5
+ License: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: openpyxl
9
+ Requires-Dist: pandas
10
+ Requires-Dist: numpy
@@ -0,0 +1,10 @@
1
+ pyproject.toml
2
+ xml2xlsx/__init__.py
3
+ xml2xlsx/format_excel.py
4
+ xml2xlsx/xml2xlsx.py
5
+ xml2xlsx_lidilem.egg-info/PKG-INFO
6
+ xml2xlsx_lidilem.egg-info/SOURCES.txt
7
+ xml2xlsx_lidilem.egg-info/dependency_links.txt
8
+ xml2xlsx_lidilem.egg-info/entry_points.txt
9
+ xml2xlsx_lidilem.egg-info/requires.txt
10
+ xml2xlsx_lidilem.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ xml2xlsx = xml2xlsx.xml2xlsx:main
@@ -0,0 +1,3 @@
1
+ openpyxl
2
+ pandas
3
+ numpy