md-spreadsheet-parser 1.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ b/md_spreadsheet_parser/pydantic_adapter.py
@@ -0,0 +1,121 @@
+ import json
+ from typing import Type, get_origin
+ from pydantic import BaseModel, ValidationError as PydanticValidationError
+
+ from .schemas import ConversionSchema
+ from .models import Table
+ from .validation import TableValidationError
+ from .utils import normalize_header
+
+
+ def validate_table_pydantic(
+     table: Table,
+     schema_cls: Type[BaseModel],
+     conversion_schema: ConversionSchema,
+ ) -> list[BaseModel]:
+     """
+     Validates a Table using Pydantic.
+     """
+     # Map headers to fields, checking aliases (Pydantic v2 stores the
+     # alias on FieldInfo).
+     model_fields = schema_cls.model_fields
+     header_map: dict[int, str] = {}  # column_index -> field_name
+
+     # Pre-compute a lookup of normalized field names and aliases:
+     # normalized string -> key to use in the row dict.
+     lookup_map = {}
+
+     for name, field_info in model_fields.items():
+         # By default Pydantic expects the alias (when one is defined)
+         # unless populate_by_name=True is set. Since the model config is
+         # not easily inspectable here, register both: the alias takes
+         # priority, and the field name is only added when it does not
+         # collide with an existing alias entry.
+         if field_info.alias:
+             lookup_map[normalize_header(field_info.alias)] = field_info.alias
+         if normalize_header(name) not in lookup_map:
+             lookup_map[normalize_header(name)] = name
+
+     normalized_headers = [normalize_header(h) for h in (table.headers or [])]
+
+     for idx, header in enumerate(normalized_headers):
+         if header in lookup_map:
+             header_map[idx] = lookup_map[header]
+
+     results = []
+     errors = []
+
+     for row_idx, row in enumerate(table.rows):
+         row_data = {}
+         for col_idx, cell_value in enumerate(row):
+             if col_idx in header_map:
+                 target_key = header_map[col_idx]
+
+                 # Check for a field-specific converter first (library-level override)
+                 if target_key in conversion_schema.field_converters:
+                     converter = conversion_schema.field_converters[target_key]
+                     try:
+                         row_data[target_key] = converter(cell_value)
+                     except Exception as e:
+                         errors.append(
+                             f"Row {row_idx + 1}: Column '{target_key}' conversion failed: {e}"
+                         )
+                 elif cell_value.strip() == "":
+                     row_data[target_key] = None
+                 else:
+                     # JSON pre-parsing: if the target field's type is dict or
+                     # list, try parsing the cell as JSON before handing it to
+                     # Pydantic (v2 stores the type on FieldInfo.annotation).
+                     val_to_set = cell_value
+
+                     # target_key may be the attribute name or the alias, but
+                     # model_fields is keyed by attribute name, so resolve the
+                     # attribute name before reading the annotation.
+                     target_field_name = None
+                     if target_key in model_fields:
+                         target_field_name = target_key
+                     else:
+                         for fname, f in model_fields.items():
+                             if f.alias == target_key:
+                                 target_field_name = fname
+                                 break
+
+                     if target_field_name:
+                         field_def = model_fields[target_field_name]
+                         ftype = field_def.annotation
+                         origin = get_origin(ftype)
+
+                         if ftype in (dict, list) or origin in (dict, list):
+                             try:
+                                 val_to_set = json.loads(cell_value)
+                             except json.JSONDecodeError:
+                                 # Fall back to the raw string and let
+                                 # Pydantic validation accept or reject it.
+                                 pass
+
+                     row_data[target_key] = val_to_set
+
+         try:
+             obj = schema_cls(**row_data)
+             results.append(obj)
+         except PydanticValidationError as e:
+             # Format Pydantic errors nicely
+             for err in e.errors():
+                 loc = ".".join(map(str, err["loc"]))
+                 msg = err["msg"]
+                 errors.append(f"Row {row_idx + 1}: Field '{loc}' - {msg}")
+
+     if errors:
+         raise TableValidationError(errors)
+
+     return results
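+
+
+ # Illustrative sketch of direct adapter use (normally reached through
+ # validate_table); Product and `table` are hypothetical:
+ #
+ #     class Product(BaseModel):
+ #         name: str = Field(alias="Product Name")   # pydantic.Field
+ #         price: float
+ #
+ #     products = validate_table_pydantic(table, Product, ConversionSchema())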
--- /dev/null
+++ b/md_spreadsheet_parser/schemas.py
@@ -0,0 +1,138 @@
+ from dataclasses import dataclass, field
+ from typing import Callable, Any
+
+
+ @dataclass(frozen=True)
+ class ParsingSchema:
+     """
+     Configuration for parsing markdown tables.
+     Designed to be immutable and passed to pure functions.
+
+     Attributes:
+         column_separator (str): Character used to separate columns. Defaults to "|".
+         header_separator_char (str): Character used in the separator row. Defaults to "-".
+         require_outer_pipes (bool): Whether tables must have outer pipes (e.g. `| col |`). Defaults to True.
+         strip_whitespace (bool): Whether to strip whitespace from cell values. Defaults to True.
+         convert_br_to_newline (bool): Whether to convert `<br>` tags in cells to newlines. Defaults to True.
+     """
+
+     column_separator: str = "|"
+     header_separator_char: str = "-"
+     require_outer_pipes: bool = True
+     strip_whitespace: bool = True
+     convert_br_to_newline: bool = True
+
+
+ # Default schema for standard Markdown tables (GFM style)
+ DEFAULT_SCHEMA = ParsingSchema()
+
+
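+ # Illustrative sketch (hypothetical configuration): accept tables written
+ # without outer pipes, e.g. `col1 | col2`:
+ #
+ #     loose_schema = ParsingSchema(require_outer_pipes=False)
+
+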
+ @dataclass(frozen=True)
+ class MultiTableParsingSchema(ParsingSchema):
+     """
+     Configuration for parsing multiple tables (workbook mode).
+     Inherits from ParsingSchema.
+
+     Attributes:
+         root_marker (str): The marker indicating the start of the data section. Defaults to "# Tables".
+         sheet_header_level (int): The markdown header level for sheets. Defaults to 2 (e.g. `## Sheet`).
+         table_header_level (int | None): The markdown header level for tables. If None, table names are not extracted. Defaults to 3.
+         capture_description (bool): Whether to capture text between the table header and the table as a description. Defaults to True.
+     """
+
+     root_marker: str = "# Tables"
+     sheet_header_level: int = 2
+     table_header_level: int | None = 3
+     capture_description: bool = True
+
+     def __post_init__(self):
+         if self.capture_description and self.table_header_level is None:
+             raise ValueError(
+                 "capture_description=True requires table_header_level to be set"
+             )
+
+
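+ # Illustrative sketch (hypothetical): skip descriptions, in which case
+ # table_header_level may also be None:
+ #
+ #     flat = MultiTableParsingSchema(table_header_level=None,
+ #                                    capture_description=False)
+
+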
+ @dataclass(frozen=True)
+ class ConversionSchema:
+     """
+     Configuration for converting string values to Python types.
+
+     Attributes:
+         boolean_pairs: Pairs of strings representing (True, False). Case-insensitive.
+             Example: `(("yes", "no"), ("on", "off"))`.
+         custom_converters: Dictionary mapping ANY Python type to a conversion function `str -> Any`.
+             You can specify:
+             - Built-in types: `int`, `float`, `bool` (to override default behavior)
+             - Standard library types: `Decimal`, `datetime`, `date`, `ZoneInfo`
+             - Custom classes: `MyClass`, `Product`
+         field_converters: Dictionary mapping field names (str) to conversion functions.
+             Takes precedence over `custom_converters`.
+     """
+
+     boolean_pairs: tuple[tuple[str, str], ...] = (
+         ("true", "false"),
+         ("yes", "no"),
+         ("1", "0"),
+         ("on", "off"),
+     )
+     custom_converters: dict[type, Callable[[str], Any]] = field(default_factory=dict)
+     field_converters: dict[str, Callable[[str], Any]] = field(default_factory=dict)
+
+
+ DEFAULT_CONVERSION_SCHEMA = ConversionSchema()
+
+
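+ # Illustrative sketch (hypothetical converters): parse prices as Decimal
+ # and a `tags` column as a list of strings:
+ #
+ #     from decimal import Decimal
+ #     schema = ConversionSchema(
+ #         custom_converters={Decimal: Decimal},
+ #         field_converters={"tags": lambda s: s.split(",")},
+ #     )
+
+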
+ @dataclass(frozen=True)
+ class ExcelParsingSchema:
+     """
+     Configuration for parsing Excel-exported data (TSV/CSV or openpyxl).
+
+     Attributes:
+         header_rows: Number of header rows (1 or 2).
+             If 2, headers are flattened to "Parent - Child" format.
+         fill_merged_headers: Whether to forward-fill empty header cells
+             (for merged cells in Excel exports).
+         delimiter: Column separator for TSV/CSV parsing. Default is tab.
+         header_separator: Separator used when flattening 2-row headers.
+     """
+
+     header_rows: int = 1
+     fill_merged_headers: bool = True
+     delimiter: str = "\t"
+     header_separator: str = " - "
+
+     def __post_init__(self):
+         if self.header_rows not in (1, 2):
+             raise ValueError("header_rows must be 1 or 2")
+
+
+ DEFAULT_EXCEL_SCHEMA = ExcelParsingSchema()
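+
+
+ # Illustrative sketch (hypothetical): a CSV export with a two-row header,
+ # flattening e.g. "Address" / "City" into "Address - City":
+ #
+ #     csv_schema = ExcelParsingSchema(header_rows=2, delimiter=",")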
--- /dev/null
+++ b/md_spreadsheet_parser/utils.py
@@ -0,0 +1,11 @@
+ def normalize_header(header: str) -> str:
+     """
+     Normalizes a header string to match field names (lowercase, snake_case).
+     Example: "User Name" -> "user_name"
+     """
+     # Strip first so surrounding whitespace does not become underscores.
+     return header.strip().lower().replace(" ", "_")
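+
+ # Illustrative behavior (assuming only the rules above):
+ #     normalize_header(" User Name ")  -> "user_name"
+ #     normalize_header("Price (USD)")  -> "price_(usd)"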
--- /dev/null
+++ b/md_spreadsheet_parser/validation.py
@@ -0,0 +1,368 @@
+ import json
+ import types
+ from dataclasses import fields, is_dataclass
+ from typing import TYPE_CHECKING, Any, Type, TypeVar, Union, get_args, get_origin, is_typeddict
+
+ if TYPE_CHECKING:
+     from .models import Table
+ from .schemas import DEFAULT_CONVERSION_SCHEMA, ConversionSchema
+ from .utils import normalize_header
+
+ T = TypeVar("T")
+
+
+ class TableValidationError(Exception):
+     """
+     Exception raised when table validation fails.
+     Contains a list of errors found during validation.
+     """
+
+     def __init__(self, errors: list[str]):
+         self.errors = errors
+         super().__init__(
+             f"Validation failed with {len(errors)} errors:\n" + "\n".join(errors)
+         )
+
+
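+ # Illustrative sketch: callers can surface every collected error at once
+ # (MySchema and table are hypothetical):
+ #
+ #     try:
+ #         rows = validate_table(table, MySchema)
+ #     except TableValidationError as exc:
+ #         print("\n".join(exc.errors))
+
+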
+ def _convert_value(
+     value: str, target_type: Type, schema: ConversionSchema = DEFAULT_CONVERSION_SCHEMA
+ ) -> Any:
+     """
+     Converts a string value to the target type.
+     Supports int, float, bool, str, dict/list (via JSON), and Optional types.
+     """
+     # Check custom converters first
+     if target_type in schema.custom_converters:
+         return schema.custom_converters[target_type](value)
+
+     origin = get_origin(target_type)
+     args = get_args(target_type)
+
+     # Handle Optional[T]: both typing.Union and the PEP 604 `X | None` form
+     if origin is Union or origin is types.UnionType:
+         if type(None) in args:
+             if not value.strip():
+                 return None
+             # Recurse into the first non-None member type
+             for arg in args:
+                 if arg is not type(None):
+                     return _convert_value(value, arg, schema)
+
+     # Handle basic types
+     if target_type is int:
+         if not value.strip():
+             raise ValueError("Empty value for int field")
+         return int(value)
+
+     if target_type is float:
+         if not value.strip():
+             raise ValueError("Empty value for float field")
+         return float(value)
+
+     if target_type is bool:
+         lower_val = value.lower().strip()
+         for true_val, false_val in schema.boolean_pairs:
+             if lower_val == true_val.lower():
+                 return True
+             if lower_val == false_val.lower():
+                 return False
+
+         raise ValueError(f"Invalid boolean value: '{value}'")
+
+     if target_type is str:
+         return value
+
+     # JSON parsing for dict/list targets; covers bare dict/list as well
+     # as parameterized forms such as dict[str, Any] or list[int].
+     if origin in (dict, list) or target_type in (dict, list):
+         if not value.strip():
+             # An empty string is not valid JSON; for user friendliness,
+             # treat it as an empty container (Optional fields already
+             # returned None above).
+             if origin:
+                 return origin()  # type: ignore
+             return target_type()  # type: ignore
+         try:
+             return json.loads(value)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid JSON for {target_type}: {e}")
+
+     # Fallback for other types (or if the type hint is missing)
+     return value
+
+
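+ # Illustrative behavior of the helper above (default ConversionSchema):
+ #
+ #     _convert_value("42", int)          -> 42
+ #     _convert_value("", int | None)     -> None
+ #     _convert_value("yes", bool)        -> True
+ #     _convert_value('{"a": 1}', dict)   -> {"a": 1}
+
+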
+ # --- Pydantic Support (Optional) ---
+
+ try:
+     from pydantic import BaseModel
+
+     HAS_PYDANTIC = True
+ except ImportError:
+     HAS_PYDANTIC = False
+     BaseModel = object  # type: ignore
+
+
+ def _validate_table_dataclass(
+     table: "Table",
+     schema_cls: Type[T],
+     conversion_schema: ConversionSchema,
+ ) -> list[T]:
+     """
+     Validates a Table using standard dataclasses.
+     """
+     # Map headers to fields
+     cls_fields = {f.name: f for f in fields(schema_cls)}  # type: ignore
+     header_map: dict[int, str] = {}  # column_index -> field_name
+
+     normalized_headers = [normalize_header(h) for h in (table.headers or [])]
+
+     for idx, header in enumerate(normalized_headers):
+         if header in cls_fields:
+             header_map[idx] = header
+
+     # Process rows
+     results: list[T] = []
+     errors: list[str] = []
+
+     for row_idx, row in enumerate(table.rows):
+         row_data = {}
+         row_errors = []
+
+         for col_idx, cell_value in enumerate(row):
+             if col_idx in header_map:
+                 field_name = header_map[col_idx]
+                 field_def = cls_fields[field_name]
+
+                 try:
+                     # Check for a field-specific converter first
+                     if field_name in conversion_schema.field_converters:
+                         converter = conversion_schema.field_converters[field_name]
+                         converted_value = converter(cell_value)
+                     else:
+                         converted_value = _convert_value(
+                             cell_value,
+                             field_def.type,  # type: ignore
+                             conversion_schema,
+                         )
+                     row_data[field_name] = converted_value
+                 except ValueError as e:
+                     row_errors.append(f"Column '{field_name}': {str(e)}")
+                 except Exception:
+                     row_errors.append(
+                         f"Column '{field_name}': Failed to convert '{cell_value}' to {field_def.type}"
+                     )
+
+         if row_errors:
+             for err in row_errors:
+                 errors.append(f"Row {row_idx + 1}: {err}")
+             continue
+
+         try:
+             obj = schema_cls(**row_data)
+             results.append(obj)
+         except TypeError as e:
+             # This catches missing required arguments
+             errors.append(f"Row {row_idx + 1}: {str(e)}")
+
+     if errors:
+         raise TableValidationError(errors)
+
+     return results
+
+
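+ # Illustrative sketch (hypothetical schema; reached via validate_table):
+ #
+ #     @dataclass
+ #     class User:
+ #         name: str
+ #         age: int | None = None
+ #
+ #     users = validate_table(table, User)
+
+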
+ def _validate_table_typeddict(
+     table: "Table",
+     schema_cls: Type[T],
+     conversion_schema: ConversionSchema,
+ ) -> list[T]:
+     """
+     Validates a Table using TypedDict.
+     """
+     # Type hints come from __annotations__; required/optional key tracking
+     # (__required_keys__ / __optional_keys__) is not enforced here.
+     annotations = schema_cls.__annotations__
+
+     header_map: dict[int, str] = {}
+     normalized_headers = [normalize_header(h) for h in (table.headers or [])]
+
+     # Map headers to TypedDict keys by normalized name; TypedDict has no
+     # native alias support, so a simple normalized match suffices.
+     for idx, header in enumerate(normalized_headers):
+         for key in annotations:
+             if normalize_header(key) == header:
+                 header_map[idx] = key
+                 break
+
+     results: list[T] = []
+     errors: list[str] = []
+
+     for row_idx, row in enumerate(table.rows):
+         row_data = {}
+         row_errors = []
+
+         for col_idx, cell_value in enumerate(row):
+             if col_idx in header_map:
+                 key = header_map[col_idx]
+                 target_type = annotations[key]
+
+                 try:
+                     if key in conversion_schema.field_converters:
+                         converter = conversion_schema.field_converters[key]
+                         converted_value = converter(cell_value)
+                     else:
+                         converted_value = _convert_value(
+                             cell_value, target_type, conversion_schema
+                         )
+                     row_data[key] = converted_value
+                 except Exception as e:
+                     row_errors.append(f"Column '{key}': {str(e)}")
+
+         if row_errors:
+             for err in row_errors:
+                 errors.append(f"Row {row_idx + 1}: {err}")
+             continue
+
+         # A TypedDict is a plain dict at runtime, so instantiation cannot
+         # fail; required-key enforcement is left to static type checkers.
+         results.append(row_data)  # type: ignore
+
+     if errors:
+         raise TableValidationError(errors)
+
+     return results
+
+
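+ # Illustrative sketch (hypothetical schema):
+ #
+ #     class UserRow(TypedDict):
+ #         name: str
+ #         age: int
+ #
+ #     rows = validate_table(table, UserRow)   # -> list of plain dicts
+
+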
+ def _validate_table_dict(
+     table: "Table",
+     conversion_schema: ConversionSchema,
+ ) -> list[dict[str, Any]]:
+     """
+     Converts a Table to a list of dicts.
+     Keys are derived from headers.
+     """
+     # Keys keep the original header strings (users asking for plain dicts
+     # usually want the headers verbatim); normalized names are used only
+     # for field_converter lookups.
+     results = []
+
+     for row in table.rows:
+         row_data = {}
+         for idx, cell_value in enumerate(row):
+             if table.headers and idx < len(table.headers):
+                 original_header = table.headers[idx]
+                 key_for_conversion = normalize_header(original_header)
+
+                 # Check converters
+                 if key_for_conversion in conversion_schema.field_converters:
+                     converter = conversion_schema.field_converters[key_for_conversion]
+                     try:
+                         val = converter(cell_value)
+                     except Exception:
+                         # For raw dicts, fall back to the unconverted
+                         # string instead of raising.
+                         val = cell_value
+                 else:
+                     val = cell_value
+
+                 row_data[original_header] = val
+         results.append(row_data)
+
+     return results
+
+
+ def validate_table(
+     table: "Table",
+     schema_cls: Type[T],
+     conversion_schema: ConversionSchema = DEFAULT_CONVERSION_SCHEMA,
+ ) -> list[T]:
+     """
+     Validates a Table object against a dataclass, Pydantic model,
+     TypedDict, or plain dict schema.
+
+     Args:
+         table: The Table object to validate.
+         schema_cls: The dataclass, Pydantic model, TypedDict, or dict type to validate against.
+         conversion_schema: Configuration for type conversion.
+
+     Returns:
+         list[T]: A list of validated instances.
+
+     Raises:
+         ValueError: If schema_cls is not a supported schema type.
+         TableValidationError: If the table has no headers or validation fails.
+     """
+     if not table.headers:
+         raise TableValidationError(["Table has no headers"])
+
+     # Check for Pydantic model
+     if HAS_PYDANTIC and issubclass(schema_cls, BaseModel):
+         # Import the adapter lazily so pydantic remains an optional dependency
+         from .pydantic_adapter import validate_table_pydantic
+
+         return validate_table_pydantic(table, schema_cls, conversion_schema)  # type: ignore
+
+     # Check for dataclass
+     if is_dataclass(schema_cls):
+         return _validate_table_dataclass(table, schema_cls, conversion_schema)
+
+     # Check for TypedDict
+     if is_typeddict(schema_cls):
+         return _validate_table_typeddict(table, schema_cls, conversion_schema)
+
+     # Check for plain dict
+     if schema_cls is dict:
+         return _validate_table_dict(table, conversion_schema)  # type: ignore
+
+     raise ValueError(
+         f"{schema_cls} must be a dataclass, Pydantic model, TypedDict, or dict"
+     )
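+
+
+ # Illustrative end-to-end sketch (hypothetical names; parse_table is
+ # assumed to be this package's markdown parsing entry point):
+ #
+ #     from dataclasses import dataclass
+ #
+ #     @dataclass
+ #     class Item:
+ #         name: str
+ #         qty: int
+ #
+ #     table = parse_table("| Name | Qty |\n| --- | --- |\n| apple | 3 |")
+ #     items = validate_table(table, Item)   # [Item(name='apple', qty=3)]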