md-spreadsheet-parser 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
"""Package namespace: re-exports the public API and declares ``__all__``."""

# Core Markdown-table parsing entry points.
from .parsing import (
    parse_table,
    parse_sheet,
    parse_workbook,
    scan_tables,
)
# File-based convenience wrappers around the parsing functions.
from .loader import (
    parse_table_from_file,
    parse_workbook_from_file,
    scan_tables_from_file,
    scan_tables_iter,
)
# Parsing configuration objects and their defaults.
from .schemas import (
    ParsingSchema,
    DEFAULT_SCHEMA,
    MultiTableParsingSchema,
    ConversionSchema,
    DEFAULT_CONVERSION_SCHEMA,
    ExcelParsingSchema,
    DEFAULT_EXCEL_SCHEMA,
)
# Data model containers produced by the parsers.
from .models import (
    Table,
    Sheet,
    Workbook,
)
from .validation import TableValidationError
# Markdown generation (the inverse of parsing).
from .generator import (
    generate_table_markdown,
    generate_sheet_markdown,
    generate_workbook_markdown,
)
# Excel/TSV/CSV ingestion helpers.
from .excel import (
    parse_excel,
    parse_excel_text,
)

__all__ = [
    "parse_table",
    "parse_sheet",
    "parse_workbook",
    "scan_tables",
    "parse_table_from_file",
    "parse_workbook_from_file",
    "scan_tables_from_file",
    "scan_tables_iter",
    "ParsingSchema",
    "MultiTableParsingSchema",
    "ConversionSchema",
    "ExcelParsingSchema",
    "Table",
    "Sheet",
    "Workbook",
    "DEFAULT_SCHEMA",
    "DEFAULT_CONVERSION_SCHEMA",
    "DEFAULT_EXCEL_SCHEMA",
    "TableValidationError",
    "generate_table_markdown",
    "generate_sheet_markdown",
    "generate_workbook_markdown",
    "parse_excel",
    "parse_excel_text",
    "converters",
]

# NOTE(review): imported after __all__ rather than with the imports above —
# presumably to avoid a circular import with the submodules; confirm before
# reordering.
from . import converters
@@ -0,0 +1,128 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from .parsing import parse_workbook, scan_tables
7
+ from .schemas import MultiTableParsingSchema
8
+
9
+
10
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the md-spreadsheet-parser tool."""
    parser = argparse.ArgumentParser(
        description="Parse Markdown tables to JSON.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "file",
        nargs="?",
        type=str,
        help="Path to Markdown file. If omitted or '-', reads from stdin.",
    )
    parser.add_argument(
        "--scan",
        action="store_true",
        help="Scan for all tables ignoring workbook structure (uses scan_tables).",
    )
    parser.add_argument(
        "--root-marker",
        type=str,
        default="# Tables",
        help="Marker indicating start of data section (for workbook mode).",
    )
    parser.add_argument(
        "--sheet-header-level",
        type=int,
        default=2,
        help="Header level for sheets (for workbook mode).",
    )
    parser.add_argument(
        "--table-header-level",
        type=int,
        default=None,
        help="Header level for tables.",
    )
    parser.add_argument(
        "--capture-description",
        action="store_true",
        help="Capture text between header and table as description. Requires --table-header-level.",
    )
    parser.add_argument(
        "--column-separator",
        type=str,
        default="|",
        help="Character used to separate columns.",
    )
    parser.add_argument(
        "--header-separator-char",
        type=str,
        default="-",
        help="Character used in the separator row.",
    )
    parser.add_argument(
        "--no-outer-pipes",
        action="store_false",
        dest="require_outer_pipes",
        help="Allow tables without outer pipes.",
    )
    parser.add_argument(
        "--no-strip-whitespace",
        action="store_false",
        dest="strip_whitespace",
        help="Do not strip whitespace from cell values.",
    )
    parser.add_argument(
        "--no-br-conversion",
        action="store_false",
        dest="convert_br_to_newline",
        help="Disable automatic conversion of <br> tags to newlines.",
    )
    return parser


def _read_content(file_arg: str | None) -> str:
    """Return Markdown text from *file_arg*, or stdin when it is None or '-'.

    Prints an error to stderr and exits with status 1 if the file cannot
    be read.
    """
    if file_arg and file_arg != "-":
        try:
            return Path(file_arg).read_text(encoding="utf-8")
        except FileNotFoundError:
            print(f"Error: File '{file_arg}' not found.", file=sys.stderr)
            sys.exit(1)
        except OSError as e:
            # Previously only FileNotFoundError was handled, so permission
            # or other I/O errors surfaced as raw tracebacks.
            print(f"Error: Cannot read file '{file_arg}': {e}", file=sys.stderr)
            sys.exit(1)
    return sys.stdin.read()


def main():
    """CLI entry point: parse Markdown tables and print them as JSON."""
    args = _build_parser().parse_args()

    # --capture-description is meaningless without a table header level.
    if args.capture_description and args.table_header_level is None:
        print(
            "Error: --capture-description requires --table-header-level to be set.",
            file=sys.stderr,
        )
        sys.exit(1)

    content = _read_content(args.file)

    # Map CLI flags onto the parsing schema.
    schema = MultiTableParsingSchema(
        root_marker=args.root_marker,
        sheet_header_level=args.sheet_header_level,
        table_header_level=args.table_header_level,
        capture_description=args.capture_description,
        column_separator=args.column_separator,
        header_separator_char=args.header_separator_char,
        require_outer_pipes=args.require_outer_pipes,
        strip_whitespace=args.strip_whitespace,
        convert_br_to_newline=args.convert_br_to_newline,
    )

    try:
        if args.scan:
            # Flat scan: emit a JSON array of every table found.
            tables = scan_tables(content, schema)
            print(json.dumps([t.json for t in tables], indent=2, ensure_ascii=False))
        else:
            # Workbook mode: emit the structured workbook as one JSON object.
            workbook = parse_workbook(content, schema)
            print(json.dumps(workbook.json, indent=2, ensure_ascii=False))
    except Exception as e:
        print(f"Error parsing content: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,126 @@
1
from decimal import Decimal
from typing import Callable, Iterable
from datetime import datetime
import re

# zoneinfo has been in the standard library since Python 3.9, and this module
# already uses 3.10+ syntax (``str | None``); the previous try/except fallback
# re-imported the very same module on ImportError, so it could never help.
from zoneinfo import ZoneInfo
13
+
14
+
15
def to_decimal_clean(value: str) -> Decimal:
    """Parse a human-formatted number string into a Decimal.

    Currency symbols ('$', '¥', '€', '£') and grouping characters
    (',', ' ', '_') are stripped first, so input like "$1,234.50"
    yields Decimal("1234.50").

    Raises:
        ValueError: If nothing remains after stripping (e.g. the input
            was only a currency symbol).
    """
    cleaned = value.translate(str.maketrans("", "", " $¥€£,_"))
    if not cleaned:
        # Empty cells are normally mapped to None by schema validation
        # before a converter runs; reaching this branch means the raw
        # text contained only formatting characters (e.g. just "$").
        raise ValueError(f"Cannot convert '{value}' to Decimal")
    return Decimal(cleaned)
29
+
30
+
31
+ def make_datetime_converter(
32
+ fmt: str | None = None, tz: ZoneInfo | None = None
33
+ ) -> Callable[[str], datetime]:
34
+ """
35
+ Create a converter function for datetime.
36
+
37
+ Args:
38
+ fmt: str format for strptime. If None, uses datetime.fromisoformat().
39
+ tz: ZoneInfo to attach (if naive) or convert to (if aware).
40
+
41
+ Returns:
42
+ Function that accepts a string and returns a datetime.
43
+ """
44
+
45
+ def converter(value: str) -> datetime:
46
+ value = value.strip()
47
+ if fmt:
48
+ dt = datetime.strptime(value, fmt)
49
+ else:
50
+ dt = datetime.fromisoformat(value)
51
+
52
+ if tz:
53
+ if dt.tzinfo is None:
54
+ # Attach timezone if naive
55
+ dt = dt.replace(tzinfo=tz)
56
+ else:
57
+ # Convert timezone if aware
58
+ dt = dt.astimezone(tz)
59
+ return dt
60
+
61
+ return converter
62
+
63
+
64
def make_list_converter(
    separator: str = ",", strip_items: bool = True, distinct: bool = False
) -> Callable[[str], list[str]]:
    """Build a string-to-list converter.

    Args:
        separator: Delimiter used to split the input. Default ",".
        strip_items: Strip whitespace around each item. Default True.
        distinct: Drop duplicate items, keeping first-seen order. Default False.

    Returns:
        A callable mapping a string to a list of strings; empty input
        yields an empty list.
    """

    def convert(value: str) -> list[str]:
        if not value:
            return []
        items = value.split(separator)
        if strip_items:
            items = [item.strip() for item in items]
        if distinct:
            # dict preserves insertion order, so this dedups in order.
            items = list(dict.fromkeys(items))
        return items

    return convert
99
+
100
+
101
def make_bool_converter(
    true_values: Iterable[str] = ("true", "yes", "1", "on"),
    false_values: Iterable[str] = ("false", "no", "0", "off"),
) -> Callable[[str], bool]:
    """Build a strict string-to-bool converter.

    Matching is case-insensitive and ignores surrounding whitespace;
    anything outside both sets raises ValueError.

    Args:
        true_values: Case-insensitive strings treated as True.
        false_values: Case-insensitive strings treated as False.

    Returns:
        A callable returning bool, or raising ValueError on no match.
    """
    truthy = frozenset(v.lower() for v in true_values)
    falsy = frozenset(v.lower() for v in false_values)

    def convert(value: str) -> bool:
        token = value.strip().lower()
        if token in truthy:
            return True
        if token in falsy:
            return False
        raise ValueError(f"Invalid boolean value: '{value}'")

    return convert
@@ -0,0 +1,183 @@
1
+ """
2
+ Excel/TSV/CSV parsing module with merged cell and hierarchical header support.
3
+
4
+ This module provides functions to parse tabular data exported from Excel
5
+ (TSV/CSV format) or directly from openpyxl Worksheets, handling:
6
+ - Merged cells (forward-fill empty cells)
7
+ - Hierarchical headers (flatten to "Parent - Child" format)
8
+ """
9
+
10
+ import csv
11
+ import io
12
+ from typing import Any, TYPE_CHECKING, Union
13
+
14
+ from .models import Table
15
+ from .schemas import ExcelParsingSchema, DEFAULT_EXCEL_SCHEMA
16
+
17
+ # --- Optional openpyxl support ---
18
+ if TYPE_CHECKING:
19
+ from openpyxl.worksheet.worksheet import Worksheet
20
+
21
+ # Type alias for parse_excel source parameter
22
+ # Note: Worksheet is only available at runtime if openpyxl is installed
23
+ ExcelSource = Union[str, list[list[str]], "Worksheet"]
24
+
25
+ try:
26
+ import openpyxl
27
+
28
+ HAS_OPENPYXL = True
29
+ except ImportError:
30
+ HAS_OPENPYXL = False
31
+ openpyxl = None # type: ignore
32
+
33
+
34
+ def _parse_tsv(text: str, delimiter: str) -> list[list[str]]:
35
+ """Parse TSV/CSV text into a 2D list using Python's csv module."""
36
+ reader = csv.reader(io.StringIO(text), delimiter=delimiter)
37
+ return list(reader)
38
+
39
+
40
+ def _forward_fill(row: list[str]) -> list[str]:
41
+ """Fill empty cells with the previous non-empty value (left-to-right)."""
42
+ result = []
43
+ prev = ""
44
+ for cell in row:
45
+ if cell.strip():
46
+ prev = cell
47
+ result.append(prev)
48
+ return result
49
+
50
+
51
+ def _flatten_headers(
52
+ parent_row: list[str], child_row: list[str], separator: str
53
+ ) -> list[str]:
54
+ """
55
+ Flatten 2-row headers into single row.
56
+ Format: "Parent - Child" if Parent differs from Child, else just Child.
57
+ """
58
+ headers = []
59
+ max_len = max(len(parent_row), len(child_row))
60
+
61
+ for i in range(max_len):
62
+ parent = parent_row[i] if i < len(parent_row) else ""
63
+ child = child_row[i] if i < len(child_row) else ""
64
+
65
+ if parent and child and parent != child:
66
+ headers.append(f"{parent}{separator}{child}")
67
+ else:
68
+ headers.append(child if child else parent)
69
+
70
+ return headers
71
+
72
+
73
+ def _safe_str(value: Any) -> str:
74
+ """
75
+ Convert value to string, handling None and integer-floats cleanly.
76
+ """
77
+ if value is None:
78
+ return ""
79
+ if isinstance(value, float) and value.is_integer():
80
+ return str(int(value))
81
+ return str(value)
82
+
83
+
84
def parse_excel_text(
    rows: list[list[str]],
    schema: ExcelParsingSchema = DEFAULT_EXCEL_SCHEMA,
) -> Table:
    """
    Build a Table from a 2D string grid, applying merged-cell fill and
    hierarchical-header flattening according to *schema*.

    Args:
        rows: 2D list of strings (e.g., from csv.reader or worksheet iteration).
        schema: Configuration for header processing.

    Returns:
        Table object with processed headers and stringified data rows.

    Raises:
        ValueError: If schema.header_rows is neither 1 nor 2.
    """
    if not rows:
        return Table(headers=None, rows=[])

    if schema.header_rows == 1:
        headers = _forward_fill(rows[0]) if schema.fill_merged_headers else rows[0]
        body = rows[1:]
    elif schema.header_rows == 2:
        if len(rows) < 2:
            # A single row cannot supply a two-level header.
            return Table(headers=rows[0] if rows else None, rows=[])
        # Only the parent row is forward-filled (merged parent spans);
        # the child row is used as-is.
        top = _forward_fill(rows[0]) if schema.fill_merged_headers else rows[0]
        headers = _flatten_headers(top, rows[1], schema.header_separator)
        body = rows[2:]
    else:
        # Schema validation should prevent this; kept as a hard failure.
        raise ValueError(f"Invalid header_rows: {schema.header_rows}")

    # Normalize every data cell to str (worksheet values may be numbers/None).
    normalized = [[_safe_str(cell) for cell in row] for row in body]
    return Table(headers=headers, rows=normalized)
132
+
133
+
134
def parse_excel(
    source: ExcelSource,
    schema: ExcelParsingSchema = DEFAULT_EXCEL_SCHEMA,
) -> Table:
    """
    Parse Excel data from a worksheet, delimited text, or a 2D array.

    Args:
        source: One of:
            - openpyxl.Worksheet (if openpyxl is installed)
            - str: TSV/CSV text content
            - list[list[str]]: Pre-parsed 2D array
        schema: Configuration for parsing.

    Returns:
        Table object with processed headers and data.

    Raises:
        TypeError: If source type is not supported.
    """
    # Worksheet detection is duck-typed on iter_rows (only when openpyxl exists).
    if HAS_OPENPYXL and hasattr(source, "iter_rows"):
        worksheet: Any = source
        grid = [
            [_safe_str(cell) for cell in record]
            for record in worksheet.iter_rows(values_only=True)
        ]
        return parse_excel_text(grid, schema)

    if isinstance(source, str):
        # Raw TSV/CSV text.
        return parse_excel_text(_parse_tsv(source, schema.delimiter), schema)

    if isinstance(source, list):
        # Trusted to already be list[list[str]].
        return parse_excel_text(source, schema)

    supported = "openpyxl.Worksheet, str, or list[list[str]]"
    if not HAS_OPENPYXL:
        supported = (
            "str or list[list[str]] (install openpyxl for Worksheet support)"
        )
    raise TypeError(
        f"Unsupported source type: {type(source).__name__}. Expected {supported}."
    )
@@ -0,0 +1,170 @@
1
+ import json
2
+ from typing import TYPE_CHECKING
3
+
4
+ from .schemas import DEFAULT_SCHEMA, MultiTableParsingSchema, ParsingSchema
5
+
6
+ if TYPE_CHECKING:
7
+ from .models import Sheet, Table, Workbook
8
+
9
+
10
def generate_table_markdown(
    table: "Table", schema: ParsingSchema = DEFAULT_SCHEMA
) -> str:
    """
    Generates a Markdown string representation of the table.

    Args:
        table: The Table object.
        schema (ParsingSchema, optional): Configuration for formatting.

    Returns:
        str: The Markdown string.
    """
    lines = []
    sep = f" {schema.column_separator} "

    def _wrap(row_str: str) -> str:
        # Surround a joined row with outer pipes when the schema requires them.
        # (Previously this three-line wrap was duplicated in three places.)
        if schema.require_outer_pipes:
            return f"{schema.column_separator} {row_str} {schema.column_separator}"
        return row_str

    def _prepare_cell(cell: str) -> str:
        # Re-encode embedded newlines as <br> so a cell stays on one line.
        if schema.convert_br_to_newline and "\n" in cell:
            return cell.replace("\n", "<br>")
        return cell

    def _separator_cell(alignment: str) -> str:
        # Alignment markers: ":---" left, "---:" right, ":---:" center.
        base = schema.header_separator_char * 3
        if alignment == "left":
            return ":" + base
        if alignment == "right":
            return base + ":"
        if alignment == "center":
            return ":" + base + ":"
        return base

    # Table name / description are only emitted for multi-table schemas.
    if isinstance(schema, MultiTableParsingSchema):
        if table.name and schema.table_header_level is not None:
            lines.append(f"{'#' * schema.table_header_level} {table.name}")
            lines.append("")  # Empty line after name
        if table.description and schema.capture_description:
            lines.append(table.description)
            lines.append("")  # Empty line after description

    if table.headers:
        lines.append(_wrap(sep.join(_prepare_cell(h) for h in table.headers)))

        separator_cells = []
        for i, _ in enumerate(table.headers):
            alignment = "default"
            if table.alignments and i < len(table.alignments):
                # Guard against None entries in the alignments list.
                alignment = table.alignments[i] or "default"
            separator_cells.append(_separator_cell(alignment))
        lines.append(_wrap(sep.join(separator_cells)))

    for row in table.rows:
        lines.append(_wrap(sep.join(_prepare_cell(cell) for cell in row)))

    # Visual metadata round-trips via an HTML comment after the table.
    if table.metadata and "visual" in table.metadata:
        metadata_json = json.dumps(table.metadata["visual"])
        lines.append("")
        lines.append(f"<!-- md-spreadsheet-table-metadata: {metadata_json} -->")

    return "\n".join(lines)
100
+
101
+
102
def generate_sheet_markdown(
    sheet: "Sheet", schema: ParsingSchema = DEFAULT_SCHEMA
) -> str:
    """
    Generates a Markdown string representation of the sheet.

    Args:
        sheet: The Sheet object.
        schema (ParsingSchema, optional): Configuration for formatting.

    Returns:
        str: The Markdown string.
    """
    is_multi = isinstance(schema, MultiTableParsingSchema)
    lines = []

    if is_multi:
        # Sheet title header, e.g. "## SheetName".
        lines.append(f"{'#' * schema.sheet_header_level} {sheet.name}")
        lines.append("")

    last = len(sheet.tables) - 1
    for index, table in enumerate(sheet.tables):
        lines.append(generate_table_markdown(table, schema))
        if index != last:
            # Blank line between consecutive tables.
            lines.append("")

    # Sheet metadata is appended at the end as an HTML comment.
    if is_multi and sheet.metadata:
        lines.append("")
        payload = json.dumps(sheet.metadata)
        lines.append(f"<!-- md-spreadsheet-sheet-metadata: {payload} -->")

    return "\n".join(lines)
134
+
135
+
136
def generate_workbook_markdown(
    workbook: "Workbook", schema: MultiTableParsingSchema
) -> str:
    """
    Generates a Markdown string representation of the workbook.

    Args:
        workbook: The Workbook object.
        schema (MultiTableParsingSchema): Configuration for formatting.

    Returns:
        str: The Markdown string.
    """
    lines: list[str] = []

    if schema.root_marker:
        # Root marker (e.g. "# Tables") opens the data section.
        lines.extend([schema.root_marker, ""])

    last = len(workbook.sheets) - 1
    for index, sheet in enumerate(workbook.sheets):
        lines.append(generate_sheet_markdown(sheet, schema))
        if index != last:
            # Blank line between consecutive sheets.
            lines.append("")

    if workbook.metadata:
        # Keep one blank line between the last sheet and the metadata comment.
        if lines and lines[-1] != "":
            lines.append("")
        payload = json.dumps(workbook.metadata)
        lines.append(f"<!-- md-spreadsheet-workbook-metadata: {payload} -->")

    return "\n".join(lines)