md-spreadsheet-parser 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,183 @@
1
+ from typing import Union, TextIO, Iterator, Iterable
2
+ from pathlib import Path
3
+ from dataclasses import replace
4
+
5
+ from .models import Table, Workbook
6
+ from .schemas import ParsingSchema, MultiTableParsingSchema, DEFAULT_SCHEMA
7
+ from .parsing import parse_table, parse_workbook, scan_tables
8
+
9
+
10
+ def _read_content(source: Union[str, Path, TextIO]) -> str:
11
+ """Helper to read content from file path or file object."""
12
+ if isinstance(source, (str, Path)):
13
+ with open(source, "r", encoding="utf-8") as f:
14
+ return f.read()
15
+ if hasattr(source, "read"):
16
+ return source.read()
17
+ raise ValueError(f"Invalid source type: {type(source)}")
18
+
19
+
20
def parse_table_from_file(
    source: Union[str, Path, TextIO], schema: ParsingSchema = DEFAULT_SCHEMA
) -> Table:
    """
    Parse a markdown table from a file.

    Args:
        source: File path (str/Path) or file-like object.
        schema: Parsing configuration.

    Returns:
        Table: The parsed table.
    """
    # Read everything up front, then delegate to the string-based parser.
    return parse_table(_read_content(source), schema)
32
+
33
+
34
def parse_workbook_from_file(
    source: Union[str, Path, TextIO],
    schema: MultiTableParsingSchema = MultiTableParsingSchema(),
) -> Workbook:
    """
    Parse a markdown workbook from a file.

    Args:
        source: File path (str/Path) or file-like object.
        schema: Parsing configuration. NOTE(review): the default instance is
            created once at definition time and shared across calls — safe
            only while the schema stays immutable; confirm.

    Returns:
        Workbook: The parsed workbook.
    """
    # Read everything up front, then delegate to the string-based parser.
    return parse_workbook(_read_content(source), schema)
47
+
48
+
49
def scan_tables_from_file(
    source: Union[str, Path, TextIO], schema: MultiTableParsingSchema | None = None
) -> list[Table]:
    """
    Scan a markdown file for all tables.

    Args:
        source: File path (str/Path) or file-like object.
        schema: Optional schema.

    Returns:
        list[Table]: All tables found in the file.
    """
    # Read everything up front, then delegate to the string-based scanner.
    return scan_tables(_read_content(source), schema)
61
+
62
+
63
+ def _iter_lines(source: Union[str, Path, TextIO, Iterable[str]]) -> Iterator[str]:
64
+ """Helper to iterate lines from various sources."""
65
+ if isinstance(source, (str, Path)):
66
+ # If it's a file path, valid file
67
+ with open(source, "r", encoding="utf-8") as f:
68
+ yield from f
69
+ elif hasattr(source, "read") or isinstance(source, Iterable):
70
+ # File object or list of strings
71
+ # If it's a file object, iterating it yields lines
72
+ for line in source:
73
+ yield line
74
+ else:
75
+ raise ValueError(f"Invalid source type for iteration: {type(source)}")
76
+
77
+
78
def scan_tables_iter(
    source: Union[str, Path, TextIO, Iterable[str]],
    schema: MultiTableParsingSchema | None = None,
) -> Iterator[Table]:
    """
    Stream tables from a source (file path, file object, or iterable) one by one.

    This allows processing files larger than memory, provided that individual
    tables fit in memory. Blocks are delimited by blank lines and (when the
    schema defines ``table_header_level``) by markdown headings, whose text
    becomes the table name.

    Args:
        source: File path, open file object, or iterable of strings.
        schema: Parsing configuration. Defaults to ``MultiTableParsingSchema()``.

    Yields:
        Table objects found in the stream, annotated with name, description,
        and start/end line offsets.
    """
    if schema is None:
        schema = MultiTableParsingSchema()

    # Prefix that marks a table-name heading, e.g. "## " for level 2.
    header_prefix = None
    if schema.table_header_level is not None:
        header_prefix = "#" * schema.table_header_level + " "

    current_lines: list[str] = []
    current_name: str | None = None
    # 0-based index of the line currently being read (for line metadata).
    current_line_idx = 0
    # File line index where the current block's buffered content starts.
    block_start_line = 0

    def parse_and_yield(
        lines: list[str], name: str | None, start_offset: int
    ) -> Iterator[Table]:
        """Parse one buffered block; yield its table if it contains one."""
        if not lines:
            return

        # Cheap pre-check: without the column separator there is no table.
        block_text = "".join(lines)
        if schema.column_separator not in block_text:
            return

        # Leading lines that lack a separator are treated as the description.
        stripped_lines = [line_val.rstrip("\n") for line_val in lines]

        table_start_idx = -1
        for idx, line in enumerate(stripped_lines):
            if schema.column_separator in line:
                table_start_idx = idx
                break

        if table_start_idx != -1:
            desc_lines = stripped_lines[:table_start_idx]
            table_lines = stripped_lines[table_start_idx:]

            table = parse_table("\n".join(table_lines), schema)

            if table.rows or table.headers:
                description = None
                if schema.capture_description:
                    desc_text = "\n".join(d.strip() for d in desc_lines if d.strip())
                    if desc_text:
                        description = desc_text

                table = replace(
                    table,
                    name=name,
                    description=description,
                    start_line=start_offset + table_start_idx,
                    end_line=start_offset + len(lines),
                )
                yield table

    for line in _iter_lines(source):
        # File iteration yields lines with trailing "\n"; strip for matching.
        stripped_line = line.strip()

        is_header = header_prefix and stripped_line.startswith(header_prefix)

        if is_header:
            # A new named section starts: flush the previous buffer first.
            yield from parse_and_yield(current_lines, current_name, block_start_line)

            assert header_prefix is not None
            current_name = stripped_line[len(header_prefix) :].strip()
            current_lines = []
            # BUGFIX: buffered content begins on the line AFTER the header,
            # so the block starts at current_line_idx + 1 (consistent with
            # the blank-line branch below). The previous value made
            # start_line/end_line off by one for heading-named tables.
            block_start_line = current_line_idx + 1

        elif stripped_line == "":
            # Blank line terminates the current block.
            yield from parse_and_yield(current_lines, current_name, block_start_line)
            current_lines = []
            # The next block starts on the line after this blank one.
            block_start_line = current_line_idx + 1

        else:
            current_lines.append(line)

        current_line_idx += 1

    # End of stream: flush whatever is still buffered.
    yield from parse_and_yield(current_lines, current_name, block_start_line)
@@ -0,0 +1,491 @@
1
+ from dataclasses import dataclass, replace
2
+ from typing import Any, Literal, TypedDict, TypeVar
3
+
4
+ from .generator import (
5
+ generate_sheet_markdown,
6
+ generate_table_markdown,
7
+ generate_workbook_markdown,
8
+ )
9
+ from .schemas import (
10
+ DEFAULT_CONVERSION_SCHEMA,
11
+ DEFAULT_SCHEMA,
12
+ ConversionSchema,
13
+ MultiTableParsingSchema,
14
+ ParsingSchema,
15
+ )
16
+ from .validation import validate_table
17
+
18
# Generic row-model type returned by Table.to_models.
T = TypeVar("T")

# Column alignment markers; "default" means no explicit alignment was set.
AlignmentType = Literal["left", "center", "right", "default"]
21
+
22
+
23
class TableJSON(TypedDict):
    """
    JSON-compatible dictionary representation of a Table.

    Produced by ``Table.json``; all values are JSON-serializable.
    """

    name: str | None  # table name (e.g. taken from a markdown heading)
    description: str | None  # free-text description, if captured
    headers: list[str] | None  # column headers; None for headerless tables
    rows: list[list[str]]  # cell data, one inner list per row
    metadata: dict[str, Any]  # arbitrary metadata (never None in JSON form)
    start_line: int | None  # line offset of the table in its source, if known
    end_line: int | None  # line offset of the table's end in its source, if known
    alignments: list[AlignmentType] | None  # per-column alignment markers
36
+
37
+
38
class SheetJSON(TypedDict):
    """
    JSON-compatible dictionary representation of a Sheet.

    Produced by ``Sheet.json``.
    """

    name: str  # sheet name
    tables: list[TableJSON]  # JSON form of every table in the sheet
    metadata: dict[str, Any]  # arbitrary metadata (never None in JSON form)
46
+
47
+
48
class WorkbookJSON(TypedDict):
    """
    JSON-compatible dictionary representation of a Workbook.

    Produced by ``Workbook.json``.
    """

    sheets: list[SheetJSON]  # JSON form of every sheet in the workbook
    metadata: dict[str, Any]  # arbitrary metadata (never None in JSON form)
55
+
56
+
57
@dataclass(frozen=True)
class Table:
    """
    Represents a parsed table with optional metadata.

    Instances are immutable; every mutator-style method (``update_cell``,
    ``delete_row``, ``insert_column``, ...) returns a new ``Table`` rather
    than modifying ``self``.

    Attributes:
        headers (list[str] | None): List of column headers, or None if the table has no headers.
        rows (list[list[str]]): List of data rows.
        alignments (list[AlignmentType] | None): List of column alignments ('left', 'center', 'right', 'default'). Defaults to None.
        name (str | None): Name of the table (e.g. from a header). Defaults to None.
        description (str | None): Description of the table. Defaults to None.
        metadata (dict[str, Any] | None): Arbitrary metadata. Defaults to None (normalized to {} in __post_init__).
        start_line (int | None): Line offset where the table starts in its source document, if known. Defaults to None.
        end_line (int | None): Line offset where the table ends in its source document, if known. Defaults to None.
    """

    headers: list[str] | None
    rows: list[list[str]]
    alignments: list[AlignmentType] | None = None
    name: str | None = None
    description: str | None = None
    metadata: dict[str, Any] | None = None
    start_line: int | None = None
    end_line: int | None = None

    def __post_init__(self):
        if self.metadata is None:
            # Frozen dataclass: object.__setattr__ is the only way to
            # normalize the mutable default (None -> {}) after init.
            object.__setattr__(self, "metadata", {})

    @property
    def json(self) -> TableJSON:
        """
        Returns a JSON-compatible dictionary representation of the table.

        Returns:
            TableJSON: A dictionary containing the table data.
        """
        return {
            "name": self.name,
            "description": self.description,
            "headers": self.headers,
            "rows": self.rows,
            "metadata": self.metadata if self.metadata is not None else {},
            "start_line": self.start_line,
            "end_line": self.end_line,
            "alignments": self.alignments,
        }

    def to_models(
        self,
        schema_cls: type[T],
        conversion_schema: ConversionSchema = DEFAULT_CONVERSION_SCHEMA,
    ) -> list[T]:
        """
        Converts the table rows into a list of dataclass instances, performing validation and type conversion.

        Args:
            schema_cls (type[T]): The dataclass type to validate against.
            conversion_schema (ConversionSchema, optional): Configuration for type conversion.

        Returns:
            list[T]: A list of validated dataclass instances.

        Raises:
            ValueError: If schema_cls is not a dataclass.
            TableValidationError: If validation fails for any row or if the table has no headers.
        """
        return validate_table(self, schema_cls, conversion_schema)

    def to_markdown(self, schema: ParsingSchema = DEFAULT_SCHEMA) -> str:
        """
        Generates a Markdown string representation of the table.

        Args:
            schema (ParsingSchema, optional): Configuration for formatting.

        Returns:
            str: The Markdown string.
        """
        return generate_table_markdown(self, schema)

    def update_cell(self, row_idx: int, col_idx: int, value: str) -> "Table":
        """
        Return a new Table with the specified cell updated.

        ``row_idx == -1`` targets the header row; other indices target body
        rows. The table grows as needed: missing rows/columns are padded
        with empty strings, and tracked ``alignments`` are extended with
        ``"default"`` entries to match any new columns. Untracked
        alignments (None) stay None.

        NOTE(review): negative body indices other than -1 fall through to
        Python's negative list indexing on existing rows — confirm whether
        callers rely on that.
        """
        # Header update
        if row_idx == -1:
            if self.headers is None:
                # Headerless table: synthesize a blank header row sized from
                # the first body row (or just wide enough for col_idx).
                width = len(self.rows[0]) if self.rows else (col_idx + 1)
                new_headers = [""] * width
                if col_idx >= len(new_headers):
                    new_headers.extend([""] * (col_idx - len(new_headers) + 1))
            else:
                new_headers = list(self.headers)
                if col_idx >= len(new_headers):
                    new_headers.extend([""] * (col_idx - len(new_headers) + 1))

            # Keep alignments in sync when the header row grew. They are
            # only expanded when already tracked on this table.
            new_alignments = list(self.alignments) if self.alignments else []
            if len(new_headers) > len(new_alignments):
                if self.alignments is not None:
                    # Typed list keeps the list[AlignmentType] invariant.
                    extension: list[AlignmentType] = ["default"] * (
                        len(new_headers) - len(new_alignments)
                    )
                    new_alignments.extend(extension)

            # Preserve None when alignments were never tracked.
            final_alignments = new_alignments if self.alignments is not None else None

            new_headers[col_idx] = value

            return replace(self, headers=new_headers, alignments=final_alignments)

        # Body update: copy each row list so the original stays untouched.
        new_rows = [list(r) for r in self.rows]

        # Grow the row list if the target row does not exist yet.
        if row_idx >= len(new_rows):
            # New rows are sized from headers, else from the first body row.
            width = (
                len(self.headers)
                if self.headers
                else (len(new_rows[0]) if new_rows else 0)
            )
            if width == 0:
                width = col_idx + 1  # At least cover the new cell

            rows_to_add = row_idx - len(new_rows) + 1
            for _ in range(rows_to_add):
                new_rows.append([""] * width)

        # A write past the current column count adds columns, so tracked
        # alignments must be extended to match the new width.
        current_width = len(new_rows[0]) if new_rows else 0
        if col_idx >= current_width:
            if self.alignments is not None:
                width_needed = col_idx + 1
                current_align_len = len(self.alignments)
                if width_needed > current_align_len:
                    new_alignments = list(self.alignments)
                    extension: list[AlignmentType] = ["default"] * (
                        width_needed - current_align_len
                    )
                    new_alignments.extend(extension)
                    return replace(
                        self,
                        rows=self._update_rows_cell(new_rows, row_idx, col_idx, value),
                        alignments=new_alignments,
                    )

        return replace(
            self, rows=self._update_rows_cell(new_rows, row_idx, col_idx, value)
        )

    def _update_rows_cell(
        self,
        new_rows: list[list[str]],
        row_idx: int,
        col_idx: int,
        value: str,
    ) -> list[list[str]]:
        """Set ``new_rows[row_idx][col_idx] = value``, padding the target row
        with empty strings if it is too short. Mutates and returns ``new_rows``."""
        target_row = new_rows[row_idx]
        if col_idx >= len(target_row):
            target_row.extend([""] * (col_idx - len(target_row) + 1))
        target_row[col_idx] = value
        return new_rows

    def delete_row(self, row_idx: int) -> "Table":
        """
        Return a new Table with the row at index removed.

        An out-of-range index is ignored: a copy of the table is returned
        unchanged.
        """
        new_rows = [list(r) for r in self.rows]
        if 0 <= row_idx < len(new_rows):
            new_rows.pop(row_idx)
        return replace(self, rows=new_rows)

    def delete_column(self, col_idx: int) -> "Table":
        """
        Return a new Table with the column at index removed.

        The column is dropped from headers, every row, and alignments;
        rows too short to contain it are left as-is.
        """
        new_headers = list(self.headers) if self.headers else None
        if new_headers and 0 <= col_idx < len(new_headers):
            new_headers.pop(col_idx)

        new_rows = []
        for row in self.rows:
            new_row = list(row)
            if 0 <= col_idx < len(new_row):
                new_row.pop(col_idx)
            new_rows.append(new_row)

        new_alignments = None
        if self.alignments is not None:
            new_alignments = list(self.alignments)
            if 0 <= col_idx < len(new_alignments):
                new_alignments.pop(col_idx)

        return replace(
            self, headers=new_headers, rows=new_rows, alignments=new_alignments
        )

    def clear_column_data(self, col_idx: int) -> "Table":
        """
        Return a new Table with data in the specified column cleared (set to empty string),
        but headers and column structure preserved.
        """
        # Headers remain unchanged; only body cells are blanked.
        new_rows = []
        for row in self.rows:
            new_row = list(row)
            if 0 <= col_idx < len(new_row):
                new_row[col_idx] = ""
            new_rows.append(new_row)

        return replace(self, rows=new_rows)

    def insert_row(self, row_idx: int) -> "Table":
        """
        Return a new Table with an empty row inserted at row_idx.
        Subsequent rows are shifted down; ``row_idx`` is clamped into
        [0, len(rows)].
        """
        new_rows = [list(r) for r in self.rows]

        # New row width comes from headers, else from the first body row.
        width = (
            len(self.headers) if self.headers else (len(new_rows[0]) if new_rows else 0)
        )
        if width == 0:
            width = 1  # Default to 1 column if table is empty

        new_row = [""] * width

        # Clamp the insertion point into the valid range for list.insert.
        if row_idx < 0:
            row_idx = 0
        if row_idx > len(new_rows):
            row_idx = len(new_rows)

        new_rows.insert(row_idx, new_row)
        return replace(self, rows=new_rows)

    def insert_column(self, col_idx: int) -> "Table":
        """
        Return a new Table with an empty column inserted at col_idx.
        Subsequent columns are shifted right; when headers exist,
        ``col_idx`` is clamped into [0, len(headers)].
        """
        new_headers = list(self.headers) if self.headers else None

        if new_headers:
            # Clamp so list.insert behaves predictably at the edges.
            if col_idx < 0:
                col_idx = 0
            if col_idx > len(new_headers):
                col_idx = len(new_headers)
            new_headers.insert(col_idx, "")

        new_alignments = None
        if self.alignments is not None:
            new_alignments = list(self.alignments)
            # Pad with defaults if the insertion point is past the tracked width.
            if col_idx > len(new_alignments):
                extension: list[AlignmentType] = ["default"] * (
                    col_idx - len(new_alignments)
                )
                new_alignments.extend(extension)
            new_alignments.insert(col_idx, "default")  # Default alignment

        new_rows = []
        for row in self.rows:
            new_row = list(row)
            # Rows may be ragged: pad a short row up to the insertion point,
            # then insert (list.insert appends when the index equals len).
            current_len = len(new_row)
            target_idx = col_idx
            if target_idx > current_len:
                new_row.extend([""] * (target_idx - current_len))
                target_idx = len(new_row)  # Append
            new_row.insert(target_idx, "")
            new_rows.append(new_row)

        return replace(
            self, headers=new_headers, rows=new_rows, alignments=new_alignments
        )
346
+
347
+
348
@dataclass(frozen=True)
class Sheet:
    """
    Represents a single sheet containing tables.

    Attributes:
        name (str): Name of the sheet.
        tables (list[Table]): List of tables contained in this sheet.
        metadata (dict[str, Any] | None): Arbitrary metadata (e.g. layout). Defaults to None.
    """

    name: str
    tables: list[Table]
    metadata: dict[str, Any] | None = None

    def __post_init__(self):
        # Frozen dataclass: bypass immutability to normalize metadata to {}.
        if self.metadata is None:
            object.__setattr__(self, "metadata", {})

    @property
    def json(self) -> SheetJSON:
        """
        Returns a JSON-compatible dictionary representation of the sheet.

        Returns:
            SheetJSON: A dictionary containing the sheet data.
        """
        meta = self.metadata if self.metadata is not None else {}
        return {
            "name": self.name,
            "tables": [table.json for table in self.tables],
            "metadata": meta,
        }

    def get_table(self, name: str) -> Table | None:
        """
        Retrieve a table by its name.

        Args:
            name (str): The name of the table to retrieve.

        Returns:
            Table | None: The first table with that name, otherwise None.
        """
        return next((table for table in self.tables if table.name == name), None)

    def to_markdown(self, schema: ParsingSchema = DEFAULT_SCHEMA) -> str:
        """
        Generates a Markdown string representation of the sheet.

        Args:
            schema (ParsingSchema, optional): Configuration for formatting.

        Returns:
            str: The Markdown string.
        """
        return generate_sheet_markdown(self, schema)
408
+
409
+
410
@dataclass(frozen=True)
class Workbook:
    """
    Represents a collection of sheets (multi-table output).

    Attributes:
        sheets (list[Sheet]): List of sheets in the workbook.
        metadata (dict[str, Any] | None): Arbitrary metadata. Defaults to None.
    """

    sheets: list[Sheet]
    metadata: dict[str, Any] | None = None

    def __post_init__(self):
        # Frozen dataclass: bypass immutability to normalize metadata to {}.
        if self.metadata is None:
            object.__setattr__(self, "metadata", {})

    @property
    def json(self) -> WorkbookJSON:
        """
        Returns a JSON-compatible dictionary representation of the workbook.

        Returns:
            WorkbookJSON: A dictionary containing the workbook data.
        """
        return {
            "sheets": [sheet.json for sheet in self.sheets],
            "metadata": self.metadata if self.metadata is not None else {},
        }

    def get_sheet(self, name: str) -> Sheet | None:
        """
        Retrieve a sheet by its name.

        Args:
            name (str): The name of the sheet to retrieve.

        Returns:
            Sheet | None: The first sheet with that name, otherwise None.
        """
        return next((sheet for sheet in self.sheets if sheet.name == name), None)

    def to_markdown(self, schema: MultiTableParsingSchema) -> str:
        """
        Generates a Markdown string representation of the workbook.

        Args:
            schema (MultiTableParsingSchema): Configuration for formatting.

        Returns:
            str: The Markdown string.
        """
        return generate_workbook_markdown(self, schema)

    def add_sheet(self, name: str) -> "Workbook":
        """
        Return a new Workbook with a new sheet added.

        The appended sheet starts with one empty three-column table
        (headers "A", "B", "C").
        """
        starter = Table(headers=["A", "B", "C"], rows=[["", "", ""]])
        appended = [*self.sheets, Sheet(name=name, tables=[starter])]
        return replace(self, sheets=appended)

    def delete_sheet(self, index: int) -> "Workbook":
        """
        Return a new Workbook with the sheet at index removed.

        Raises:
            IndexError: If *index* is negative or past the last sheet.
        """
        if not 0 <= index < len(self.sheets):
            raise IndexError("Sheet index out of range")
        remaining = [s for i, s in enumerate(self.sheets) if i != index]
        return replace(self, sheets=remaining)