md-spreadsheet-parser 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,590 @@
1
+ import json
2
+ import re
3
+ from dataclasses import replace
4
+ from typing import Any
5
+
6
+ from .models import AlignmentType, Sheet, Table, Workbook
7
+ from .schemas import DEFAULT_SCHEMA, MultiTableParsingSchema, ParsingSchema
8
+
9
+
10
def clean_cell(cell: str, schema: ParsingSchema) -> str:
    """
    Clean a single cell value.

    Applies, in order:
      1. Whitespace stripping (if ``schema.strip_whitespace``).
      2. Conversion of ``<br>`` variants to newlines (if
         ``schema.convert_br_to_newline``).
      3. Unescaping of the column separator (e.g. ``\\|`` -> ``|``).

    Args:
        cell: Raw cell text as extracted from a table row.
        schema: Parsing configuration controlling the cleanup steps.

    Returns:
        The cleaned cell text.
    """
    if schema.strip_whitespace:
        cell = cell.strip()

    if schema.convert_br_to_newline:
        # Replace <br>, <br/>, <br /> (case-insensitive) with \n
        cell = re.sub(r"<br\s*/?>", "\n", cell, flags=re.IGNORECASE)

    # Unescape the column separator (e.g. \| -> |).
    # NOTE(review): the original comment mentions handling \\ -> \ as well,
    # but only the escaped separator is unescaped here — confirm intent.
    # (The previous version also had a redundant early return in this branch.)
    if "\\" in cell:
        cell = cell.replace(f"\\{schema.column_separator}", schema.column_separator)

    return cell
30
+
31
+
32
def split_row_gfm(line: str, separator: str) -> list[str]:
    """
    Split *line* on *separator* following GFM table rules.

    Separators inside inline code spans (delimited by backticks) are treated
    as literal text, and a backslash-escaped character never acts as a
    separator. The backslash itself is preserved so a later cleanup pass can
    perform the actual unescaping.
    """
    cells: list[str] = []
    buffer: list[str] = []
    inside_code = False
    pos = 0
    length = len(line)

    while pos < length:
        ch = line[pos]

        if ch == "\\":
            # Keep the backslash and take the following character literally,
            # so an escaped separator never splits the row.
            buffer.append(ch)
            if pos + 1 < length:
                buffer.append(line[pos + 1])
                pos += 2
            else:
                # A trailing backslash escapes nothing; just move on.
                pos += 1
            continue

        if ch == "`":
            inside_code = not inside_code

        if ch == separator and not inside_code:
            # Separator outside any code span: close the current cell.
            cells.append("".join(buffer))
            buffer = []
        else:
            buffer.append(ch)

        pos += 1

    # Flush whatever remains as the final cell.
    cells.append("".join(buffer))
    return cells
78
+
79
+
80
def parse_row(line: str, schema: ParsingSchema) -> list[str] | None:
    """
    Parse one markdown line into its list of cell values.

    Returns None for blank lines. Escaped separators and separators inside
    inline code are respected (via split_row_gfm), and each cell is cleaned
    according to the schema.
    """
    stripped = line.strip()
    if not stripped:
        return None

    # State-aware split (handles code spans and escapes) instead of a regex.
    cells = split_row_gfm(stripped, schema.column_separator)

    # A row wrapped in outer pipes yields empty edge cells; drop them.
    if len(cells) > 1:
        if not cells[0].strip():
            cells = cells[1:]
        if cells and not cells[-1].strip():
            cells = cells[:-1]

    return [clean_cell(cell, schema) for cell in cells]
104
+
105
+
106
def parse_separator_row(
    row: list[str], schema: ParsingSchema
) -> list[AlignmentType] | None:
    """
    Interpret *row* as a header separator row (e.g. ---, :---, :---:, ---:).

    Returns the per-column alignments when every cell is a valid separator
    cell; returns None as soon as any cell is not.
    """
    alignments: list[AlignmentType] = []

    for raw_cell in row:
        cell = raw_cell.strip()

        # A separator cell must contain the separator character at least once.
        if schema.header_separator_char not in cell:
            return None

        # After removing separator chars and alignment colons, nothing
        # may remain — otherwise the cell is ordinary data.
        leftover = (
            cell.replace(schema.header_separator_char, "").replace(":", "").strip()
        )
        if leftover:
            return None

        # Colons at either end encode the column alignment.
        left_colon = cell.startswith(":")
        right_colon = cell.endswith(":")

        if left_colon and right_colon:
            alignments.append("center")
        elif right_colon:
            alignments.append("right")
        elif left_colon:
            alignments.append("left")
        else:
            alignments.append("default")

    return alignments
149
+
150
+
151
def is_separator_row(row: list[str], schema: ParsingSchema) -> bool:
    """
    Return True if *row* is a valid header separator row.

    Deprecated thin wrapper kept for backward compatibility; prefer
    parse_separator_row, which also yields the alignments.
    """
    alignments = parse_separator_row(row, schema)
    return alignments is not None
157
+
158
+
159
def parse_table(markdown: str, schema: ParsingSchema = DEFAULT_SCHEMA) -> Table:
    """
    Parse a markdown table into a Table object.

    Args:
        markdown: The markdown string containing the table.
        schema: Configuration for parsing.

    Returns:
        Table object with headers and rows. Headers and alignments are only
        populated when a separator row confirms the header; otherwise every
        parsed line is treated as data.
    """
    lines = markdown.strip().split("\n")
    headers: list[str] | None = None
    rows: list[list[str]] = []
    alignments: list[AlignmentType] | None = None
    visual_metadata: dict | None = None

    # Buffer for a potential header row until a separator row confirms it.
    # (The original declared this variable twice; once is enough.)
    potential_header: list[str] | None = None

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Check for an embedded table-metadata comment.
        metadata_match = re.match(
            r"^<!-- md-spreadsheet-table-metadata: (.*) -->$", line
        )
        if metadata_match:
            try:
                visual_metadata = json.loads(metadata_match.group(1))
            except json.JSONDecodeError:
                # Malformed metadata (e.g. hand-edited): drop the line rather
                # than letting the comment leak into the table data.
                pass
            continue

        parsed_row = parse_row(line, schema)
        if parsed_row is None:
            continue

        if headers is None and potential_header is not None:
            detected_alignments = parse_separator_row(parsed_row, schema)
            if detected_alignments is not None:
                # Separator row confirms the buffered row as the header.
                # (Dead statements after this `continue` were removed.)
                headers = potential_header
                alignments = detected_alignments
                potential_header = None
                continue
            # Previous row was not a header: treat it as data and buffer the
            # current row as the next header candidate.
            rows.append(potential_header)
            potential_header = parsed_row
        elif headers is None and potential_header is None:
            potential_header = parsed_row
        else:
            rows.append(parsed_row)

    # A buffered candidate that was never confirmed is plain data.
    if potential_header is not None:
        rows.append(potential_header)

    # Normalize rows to match the header length.
    if headers:
        header_len = len(headers)
        normalized_rows = []
        for row in rows:
            if len(row) < header_len:
                # Pad short rows with empty strings.
                row.extend([""] * (header_len - len(row)))
            elif len(row) > header_len:
                # Truncate long rows.
                row = row[:header_len]
            normalized_rows.append(row)
        rows = normalized_rows

    metadata: dict[str, Any] = {"schema_used": str(schema)}
    if visual_metadata:
        metadata["visual"] = visual_metadata

    return Table(headers=headers, rows=rows, metadata=metadata, alignments=alignments)
249
+
250
+
251
def _extract_tables_simple(
    lines: list[str], schema: ParsingSchema, start_line_offset: int
) -> list[Table]:
    """
    Extract tables by splitting lines into blocks at blank lines.

    Used for content within a block or when no table header level is set.
    A block containing only a metadata comment is merged into the previous
    table's visual metadata; orphan metadata blocks (no previous table) are
    dropped.

    Args:
        lines: The lines to scan.
        schema: Parsing configuration.
        start_line_offset: Offset added to block-local indices so the
            resulting tables report absolute start/end lines.

    Returns:
        The tables found, in order of appearance.
    """
    tables: list[Table] = []
    current_block: list[str] = []
    block_start = 0

    def flush_block(block_lines: list[str], start_idx: int, end_idx: int) -> None:
        # Parse one blank-line-delimited block and record or merge its result.
        block_text = "\n".join(block_lines)
        if (
            schema.column_separator not in block_text
            and "<!-- md-spreadsheet-table-metadata:" not in block_text
        ):
            return

        table = parse_table(block_text, schema)
        if table.rows or table.headers:
            table = replace(
                table,
                start_line=start_line_offset + start_idx,
                end_line=start_line_offset + end_idx,
            )
            tables.append(table)
        elif table.metadata and "visual" in table.metadata and tables:
            # Metadata-only block: merge its visual metadata into the
            # previously emitted table.
            last_table = tables[-1]
            last_metadata = last_table.metadata or {}
            merged_visual = last_metadata.get("visual", {}).copy()
            merged_visual.update(table.metadata["visual"])

            updated_md = last_metadata.copy()
            updated_md["visual"] = merged_visual
            tables[-1] = replace(last_table, metadata=updated_md)
        # Metadata-only blocks with no preceding table are ignored (orphans).

    for idx, line in enumerate(lines):
        if not line.strip():
            if current_block:
                flush_block(current_block, block_start, idx)
                current_block = []
                block_start = idx + 1
        else:
            if not current_block:
                block_start = idx
            current_block.append(line)

    # Trailing block. BUG FIX: this path previously checked for the marker
    # "md-spreadsheet-metadata:" instead of "md-spreadsheet-table-metadata:",
    # so a trailing metadata-only block was silently dropped. Sharing
    # flush_block makes both paths consistent.
    if current_block:
        flush_block(current_block, block_start, len(lines))

    return tables
326
+
327
+
328
def _extract_tables(
    text: str, schema: MultiTableParsingSchema, start_line_offset: int = 0
) -> list[Table]:
    """
    Extract tables from text.

    If table_header_level is set, splits by that header.
    Otherwise, splits by blank lines.

    Args:
        text: Markdown content to scan.
        schema: Multi-table parsing configuration.
        start_line_offset: Line offset of *text* within the larger document,
            used to report absolute line numbers on the parsed tables.

    Returns:
        All tables found, in document order. The first table of each named
        header block carries the block's name and (optionally) description.
    """
    if schema.table_header_level is None:
        # No header-based splitting configured: fall back to blank-line blocks.
        return _extract_tables_simple(text.split("\n"), schema, start_line_offset)

    # Split by table header
    header_prefix = "#" * schema.table_header_level + " "
    lines = text.split("\n")
    tables: list[Table] = []

    # Accumulators for the current header-delimited block; mutated by the
    # loop below and read by the process_table_block closure.
    current_table_lines: list[str] = []
    current_table_name: str | None = None
    current_description_lines: list[str] = []
    # NOTE(review): initialized to start_line_offset, but process_table_block
    # adds start_line_offset again when computing abs_content_start — for a
    # block that begins before the first header this looks like it
    # double-counts the offset. Confirm against callers before changing.
    current_block_start_line = start_line_offset

    def process_table_block(end_line_idx: int):
        # Flush the accumulated block: parse its tables and attach the
        # current name/description to the first one.
        # NOTE(review): end_line_idx is currently unused in this body.
        if not current_table_lines:
            return

        # Try to separate description from table content
        # Simple heuristic: find the first line that looks like a table row
        table_start_idx = -1
        for idx, line in enumerate(current_table_lines):
            if schema.column_separator in line:
                table_start_idx = idx
                break

        if table_start_idx != -1:
            # Description is everything before table start
            desc_lines = (
                current_description_lines + current_table_lines[:table_start_idx]
            )

            # Content is everything after (and including) table start
            content_lines = current_table_lines[table_start_idx:]

            # Logic adjustment:
            # If named, content starts at header_line + 1.
            # If unnamed, content starts at current_block_start_line.
            offset_correction = 1 if current_table_name else 0

            # Absolute start line of the content part
            abs_content_start = (
                start_line_offset
                + current_block_start_line
                + offset_correction
                + table_start_idx
            )

            # Parse tables from the content lines
            block_tables = _extract_tables_simple(
                content_lines, schema, abs_content_start
            )

            if block_tables:
                # The first table found gets the name and description
                first_table = block_tables[0]

                description = (
                    "\n".join(line.strip() for line in desc_lines if line.strip())
                    if schema.capture_description
                    else None
                )
                if description == "":
                    description = None

                first_table = replace(
                    first_table, name=current_table_name, description=description
                )
                block_tables[0] = first_table

                # Append all found tables
                tables.extend(block_tables)

    for idx, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith(header_prefix):
            # New table header: flush the previous block, then reset the
            # accumulators for the block that starts here.
            process_table_block(idx)
            current_table_name = stripped[len(header_prefix) :].strip()
            current_table_lines = []
            current_description_lines = []
            current_block_start_line = idx
        else:
            # Accumulate lines regardless of whether we have a name
            current_table_lines.append(line)

    # Flush the trailing block after the last header (or the whole text if
    # no header was seen).
    process_table_block(len(lines))

    return tables
423
+
424
+
425
def parse_sheet(
    markdown: str,
    name: str,
    schema: MultiTableParsingSchema,
    start_line_offset: int = 0,
) -> Sheet:
    """
    Parse a sheet (section) containing one or more tables.

    A sheet-level metadata comment, if present, is decoded as JSON; only the
    first occurrence is considered, and invalid JSON is treated as absent.
    """
    sheet_metadata: dict[str, Any] | None = None

    # Look for an embedded sheet-metadata comment (first match wins).
    match = re.search(
        r"^<!-- md-spreadsheet-sheet-metadata: (.*) -->$", markdown, re.MULTILINE
    )
    if match is not None:
        try:
            sheet_metadata = json.loads(match.group(1))
        except json.JSONDecodeError:
            # Invalid JSON in the comment: ignore it.
            sheet_metadata = None

    return Sheet(
        name=name,
        tables=_extract_tables(markdown, schema, start_line_offset),
        metadata=sheet_metadata,
    )
449
+
450
+
451
def parse_workbook(
    markdown: str, schema: MultiTableParsingSchema | None = None
) -> Workbook:
    """
    Parse a markdown document into a Workbook.

    Args:
        markdown: The markdown text.
        schema: Optional schema. If None, a default MultiTableParsingSchema
            is used. (Previously the default was a shared instance created
            once at import time — a mutable-default-argument anti-pattern;
            this also matches the None-sentinel style used by scan_tables.)

    Returns:
        A Workbook with the sheets found (and workbook metadata, if any).
    """
    if schema is None:
        schema = MultiTableParsingSchema()

    lines = markdown.split("\n")
    sheets: list[Sheet] = []
    metadata: dict[str, Any] | None = None

    # Scan for workbook metadata anywhere in the file and filter those lines
    # out so they don't interfere with sheet content.
    filtered_lines: list[str] = []
    wb_metadata_pattern = re.compile(
        r"^<!-- md-spreadsheet-workbook-metadata: (.*) -->$"
    )

    for line in lines:
        stripped = line.strip()
        match = wb_metadata_pattern.match(stripped)
        if match:
            try:
                metadata = json.loads(match.group(1))
            except json.JSONDecodeError:
                pass
            # Skip adding this line to filtered_lines.
        else:
            filtered_lines.append(line)

    lines = filtered_lines

    # Find the root marker (if configured); content before it is ignored.
    start_index = 0
    in_code_block = False
    if schema.root_marker:
        found = False
        for i, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith("```"):
                in_code_block = not in_code_block

            # Only honor the marker outside fenced code blocks.
            if not in_code_block and stripped == schema.root_marker:
                start_index = i + 1
                found = True
                break
        if not found:
            return Workbook(sheets=[], metadata=metadata)

    # Split by sheet headers.
    header_prefix = "#" * schema.sheet_header_level + " "

    current_sheet_name: str | None = None
    current_sheet_lines: list[str] = []
    current_sheet_start_line = start_index

    # Reset code-block state for the second pass; the root marker is assumed
    # not to sit inside a code block (handled above).
    in_code_block = False

    for idx, line in enumerate(lines[start_index:], start=start_index):
        stripped = line.strip()

        if stripped.startswith("```"):
            in_code_block = not in_code_block

        if in_code_block:
            # Inside a fence: just collect lines if we are in a sheet.
            if current_sheet_name:
                current_sheet_lines.append(line)
            continue

        # Check if the line is a markdown header and count its level.
        if stripped.startswith("#"):
            level = 0
            for char in stripped:
                if char == "#":
                    level += 1
                else:
                    break

            # A header above sheet level (e.g. "#" vs "##") closes the
            # workbook section entirely.
            if level < schema.sheet_header_level:
                break

        if stripped.startswith(header_prefix):
            # New sheet header: flush the previous sheet first. Collected
            # lines start AFTER the header, hence the +1 offset.
            if current_sheet_name:
                sheet_content = "\n".join(current_sheet_lines)
                sheets.append(
                    parse_sheet(
                        sheet_content,
                        current_sheet_name,
                        schema,
                        start_line_offset=current_sheet_start_line + 1,
                    )
                )

            current_sheet_name = stripped[len(header_prefix) :].strip()
            current_sheet_lines = []
            current_sheet_start_line = idx
        else:
            if current_sheet_name:
                current_sheet_lines.append(line)

    # Flush the final sheet.
    if current_sheet_name:
        sheet_content = "\n".join(current_sheet_lines)
        sheets.append(
            parse_sheet(
                sheet_content,
                current_sheet_name,
                schema,
                start_line_offset=current_sheet_start_line + 1,
            )
        )

    return Workbook(sheets=sheets, metadata=metadata)
573
+
574
+
575
def scan_tables(
    markdown: str, schema: MultiTableParsingSchema | None = None
) -> list[Table]:
    """
    Scan a markdown document for all tables, ignoring sheet structure.

    Args:
        markdown: The markdown text.
        schema: Optional schema. If None, uses default MultiTableParsingSchema.

    Returns:
        All tables discovered in the document, in order of appearance.
    """
    effective_schema = schema if schema is not None else MultiTableParsingSchema()
    return _extract_tables(markdown, effective_schema)
File without changes