format-docstring 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- format_docstring/__init__.py +5 -0
- format_docstring/base_fixer.py +70 -0
- format_docstring/config.py +211 -0
- format_docstring/docstring_rewriter.py +314 -0
- format_docstring/line_wrap_google.py +7 -0
- format_docstring/line_wrap_numpy.py +387 -0
- format_docstring/line_wrap_utils.py +781 -0
- format_docstring/main_jupyter.py +165 -0
- format_docstring/main_py.py +125 -0
- format_docstring-0.1.0.dist-info/METADATA +311 -0
- format_docstring-0.1.0.dist-info/RECORD +15 -0
- format_docstring-0.1.0.dist-info/WHEEL +5 -0
- format_docstring-0.1.0.dist-info/entry_points.txt +3 -0
- format_docstring-0.1.0.dist-info/licenses/LICENSE +21 -0
- format_docstring-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,781 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import textwrap
|
|
5
|
+
|
|
6
|
+
# Regex pattern to split text into paragraphs (multiple consecutive newlines)
|
|
7
|
+
# ``\s*`` between the two newlines means that lines holding only whitespace
# (and longer runs of blank lines) also count as a single paragraph break.
_PARAGRAPH_SPLIT_PATTERN = re.compile(r'\n\s*\n')
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def add_leading_indent(docstring: str, leading_indent: int | None) -> str:
    r"""Make the docstring body begin with a newline followed by the indent.

    Parameters
    ----------
    docstring : str
        The docstring body to inspect.
    leading_indent : int | None
        Number of spaces expected after the opening newline. ``None``
        disables the check entirely.

    Returns
    -------
    str
        ``docstring`` unchanged when ``leading_indent`` is ``None`` or the
        text already starts with ``"\n" + ' ' * leading_indent``; otherwise
        that prefix followed by ``docstring``.
    """
    if leading_indent is None:
        return docstring

    prefix: str = '\n' + ' ' * leading_indent
    return docstring if docstring.startswith(prefix) else prefix + docstring
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def finalize_lines(out_lines: list[str], leading_indent: int | None) -> str:
    """Join output lines into the final docstring body.

    Parameters
    ----------
    out_lines : list[str]
        The lines to assemble.
    leading_indent : int | None
        Number of spaces to place after a final newline so the closing
        quotes line up. ``None`` means no closing line is appended.

    Returns
    -------
    str
        The joined text, where trailing spaces are removed from every line,
        whitespace-only lines become empty, trailing newlines are dropped,
        and (when ``leading_indent`` is not ``None``) the text ends with a
        newline plus ``leading_indent`` spaces.
    """
    cleaned: list[str] = []
    for raw in out_lines:
        trimmed = raw.rstrip(' ')
        # A line made only of whitespace (e.g. tabs) collapses to ''.
        cleaned.append('' if trimmed.strip() == '' else trimmed)

    text = '\n'.join(cleaned).rstrip('\n')
    if leading_indent is None:
        return text

    closing = '\n' + ' ' * leading_indent
    return text if text.endswith(closing) else text + closing
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def collect_to_temp_output(temp_out: list[str | list[str]], line: str) -> None:
    """Add ``line`` to the temporary output buffer in place.

    Parameters
    ----------
    temp_out : list[str | list[str]]
        The buffer to mutate.
    line : str
        The line to store.

    Notes
    -----
    - An empty buffer receives ``line`` as a bare string.
    - If the last element is a list, ``line`` is appended to that list.
    - If the last element is a string, a new one-element list ``[line]`` is
      appended to the buffer.
    """
    if not temp_out:
        temp_out.append(line)
        return

    tail = temp_out[-1]
    if isinstance(tail, list):
        tail.append(line)
        return

    temp_out.append([line])
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def process_temp_output(
        temp_out: list[str | list[str]],
        width: int,
) -> list[str]:
    """Flatten ``temp_out``, wrapping every ``list[str]`` element.

    Parameters
    ----------
    temp_out : list[str | list[str]]
        Buffer whose string elements are copied through as-is and whose list
        elements are passed to ``wrap_preserving_indent``.
    width : int
        The target line width handed to the wrapper.

    Returns
    -------
    list[str]
        The flattened lines after ``fix_typos_in_section_headings`` has been
        applied.

    Raises
    ------
    RuntimeError
        If an element is neither a ``str`` nor a ``list``.
    """
    collected: list[str] = []

    for item in temp_out:
        if isinstance(item, str):
            collected.append(item)
            continue

        if isinstance(item, list):
            collected.extend(wrap_preserving_indent(item, width))
            continue

        raise RuntimeError("Something's wrong. Please contact the author.")

    return fix_typos_in_section_headings(collected)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def wrap_preserving_indent(lines: list[str], width: int) -> list[str]:
    """Wrap regular text while leaving non-wrappable segments untouched.

    The lines are split by ``segment_lines_by_wrappability``; segments flagged
    as wrappable go through ``_wrap_text_segment`` and all others are copied
    through exactly as they are.

    Parameters
    ----------
    lines : list[str]
        The list of lines to process.
    width : int
        The target line width for wrapping.

    Returns
    -------
    list[str]
        The processed lines, or an empty list when ``lines`` is empty.
    """
    if not lines:
        return []

    wrapped: list[str] = []
    for chunk, can_wrap in segment_lines_by_wrappability(lines):
        # Non-wrappable chunks (tables, lists, literal blocks) stay verbatim.
        wrapped.extend(_wrap_text_segment(chunk, width) if can_wrap else chunk)

    return wrapped
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _wrap_text_segment(lines: list[str], width: int) -> list[str]:
    """Re-wrap a segment of regular text to ``width`` columns.

    The leading-space indentation of the first non-blank line is applied to
    every output line. Lines are first merged per paragraph by
    ``merge_lines_and_strip``; each paragraph is then wrapped separately and
    paragraphs are separated by one empty line.

    Parameters
    ----------
    lines : list[str]
        The lines of the segment.
    width : int
        The target line width, indentation included.

    Returns
    -------
    list[str]
        The wrapped lines. ``[]`` for empty input, and ``lines`` itself when
        every line is blank or when wrapping produced nothing.
    """
    if not lines:
        return []

    # The first line with visible content decides the indentation.
    first_text = next((ln for ln in lines if ln.strip()), '')
    if not first_text:
        return lines  # All empty lines

    pad: str = ' ' * (len(first_text) - len(first_text.lstrip(' ')))

    wrapper: textwrap.TextWrapper = textwrap.TextWrapper(
        width=max(1, width - len(pad)),  # never ask textwrap for width < 1
        break_long_words=False,
        break_on_hyphens=False,
        replace_whitespace=False,
        drop_whitespace=True,
    )

    flat: list[str] = []
    for para in merge_lines_and_strip('\n'.join(lines)).split('\n\n'):
        if para.strip():
            pieces: list[str] = wrapper.wrap(para.lstrip(' '))
            if pieces:
                flat.extend(pad + piece for piece in pieces)
            else:
                flat.append(pad + para)
        else:
            # Empty paragraph means extra line break - preserve it
            flat.append('')

        flat.append('')  # separator after every paragraph

    # Drop the separator that follows the last paragraph.
    if flat and flat[-1] == '':
        flat.pop()

    return flat or lines
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def merge_lines_and_strip(text: str) -> str:
    r"""Merge lines within paragraphs, preserving paragraph breaks.

    The text is split into paragraphs with ``_PARAGRAPH_SPLIT_PATTERN``.
    Inside each paragraph every line is stripped of surrounding whitespace,
    blank lines are discarded, and the remaining lines are joined with a
    single space. Paragraphs that end up empty are dropped; the rest are
    joined with a double newline.

    Parameters
    ----------
    text : str
        The input text containing multiple lines with potential leading or
        trailing whitespace and paragraph breaks.

    Returns
    -------
    str
        The processed text with lines merged within paragraphs and paragraph
        breaks preserved as double newlines.

    Examples
    --------
    >>> text = ' something like this\n and this is the 2nd\n line,'
    >>> merge_lines_and_strip(text)
    'something like this and this is the 2nd line,'

    >>> text = 'first para\nstill first\n\nsecond para\nstill second'
    >>> merge_lines_and_strip(text)
    'first para still first\n\nsecond para still second'
    """
    # NOTE: this docstring is a raw string, so the examples above must use a
    # single backslash for doctest to see a real newline escape.
    merged: list[str] = []
    for block in _PARAGRAPH_SPLIT_PATTERN.split(text):
        pieces = [piece.strip() for piece in block.split('\n') if piece.strip()]
        if pieces:  # Only add non-empty paragraphs
            merged.append(' '.join(pieces))

    return '\n\n'.join(merged)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def fix_typos_in_section_headings(lines: list[str]) -> list[str]:
    """Normalise known section headings that sit above a dashed underline.

    A line counts as a heading when the next line, once stripped, consists of
    two or more ``-`` characters and nothing else. If the heading text
    (case-insensitively) is one of the known names or its singular form, it
    is replaced by the canonical spelling and the underline is resized to the
    same length. The leading whitespace of both lines is kept.

    Parameters
    ----------
    lines : list[str]
        The lines to scan.

    Returns
    -------
    list[str]
        A corrected copy of ``lines``; ``lines`` itself when it has fewer
        than two entries.
    """
    if len(lines) < 2:
        return lines

    # Lower-cased heading text -> canonical heading.
    canonical_by_key = {
        'return': 'Returns',
        'parameter': 'Parameters',
        'other parameter': 'Other Parameters',
        'attribute': 'Attributes',
        'yield': 'Yields',
        'raise': 'Raises',
        'note': 'Notes',
        'example': 'Examples',
        # Correct spelling, possibly in the wrong case
        'returns': 'Returns',
        'parameters': 'Parameters',
        'other parameters': 'Other Parameters',
        'attributes': 'Attributes',
        'yields': 'Yields',
        'raises': 'Raises',
        'notes': 'Notes',
        'examples': 'Examples',
    }

    fixed = list(lines)

    # Decisions are always made on the untouched input, never on `fixed`.
    for idx, (heading, underline) in enumerate(zip(lines, lines[1:])):
        dashes = underline.strip()
        if len(dashes) < 2 or dashes.strip('-'):
            continue  # not a dashed underline

        replacement = canonical_by_key.get(heading.strip().lower())
        if replacement is None:
            continue

        heading_pad = heading[: len(heading) - len(heading.lstrip())]
        underline_pad = underline[: len(underline) - len(underline.lstrip())]
        fixed[idx] = heading_pad + replacement
        fixed[idx + 1] = underline_pad + '-' * len(replacement)

    return fixed
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def segment_lines_by_wrappability(
        lines: list[str],
) -> list[tuple[list[str], bool]]:
    """Split lines into consecutive wrappable and non-wrappable segments.

    At each position three detectors are tried in order: ``is_rST_table``,
    ``is_bulleted_list`` and ``_is_literal_block_paragraph``. The first one
    that matches yields a non-wrappable segment. When none matches, lines are
    gathered into a wrappable segment until a detector matches or the input
    ends.

    Parameters
    ----------
    lines : list[str]
        The list of lines to segment.

    Returns
    -------
    list[tuple[list[str], bool]]
        One ``(segment_lines, is_wrappable)`` tuple per segment, in input
        order. Empty input gives an empty list.

    Examples
    --------
    >>> lines = [
    ...     'Some text that can be wrapped',
    ...     '- First list item',
    ...     '- Second list item',
    ...     'More wrappable text',
    ... ]
    >>> result = segment_lines_by_wrappability(lines)
    >>> len(result)
    3
    >>> result[0]
    (['Some text that can be wrapped'], True)
    >>> result[1]
    (['- First list item', '- Second list item'], False)
    >>> result[2]
    (['More wrappable text'], True)
    """
    if not lines:
        return []

    # Order matters: tables win over lists, lists over literal blocks.
    detectors = (is_rST_table, is_bulleted_list, _is_literal_block_paragraph)

    segments: list[tuple[list[str], bool]] = []
    total = len(lines)
    pos = 0

    while pos < total:
        for detect in detectors:
            found, end = detect(lines, pos)
            if found:
                segments.append((lines[pos:end], False))
                pos = end
                break
        else:
            # Nothing special starts here: take this line and keep going
            # until a special construct starts or the input ends.
            begin = pos
            pos += 1
            while pos < total and not any(
                    detect(lines, pos)[0] for detect in detectors
            ):
                pos += 1

            segments.append((lines[begin:pos], True))

    return segments
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def is_rST_table(lines: list[str], start_idx: int = 0) -> tuple[bool, int]:
    """Report whether a reStructuredText table starts at ``start_idx``.

    Grid tables (``_is_grid_table``) are tried first, then simple tables
    (``_is_simple_table``).

    Parameters
    ----------
    lines : list[str]
        The list of lines to check.
    start_idx : int, optional
        The starting index to check from, by default 0.

    Returns
    -------
    tuple[bool, int]
        ``(True, end_idx)`` with ``end_idx`` one past the table's last line
        when a table is found, otherwise ``(False, start_idx)``.
    """
    if start_idx < len(lines):
        for probe in (_is_grid_table, _is_simple_table):
            outcome = probe(lines, start_idx)
            if outcome[0]:
                return outcome

    return False, start_idx
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _is_grid_table(lines: list[str], start_idx: int) -> tuple[bool, int]:
    """Check for an rST grid table starting at ``start_idx``.

    Grid tables look like:
    +-----+-----+
    |  A  |  B  |
    +=====+=====+
    |  1  |  2  |
    +-----+-----+

    The first line must be a grid separator. The table then extends over
    every following separator or content line and stops at the first blank
    or unrelated line.

    Returns
    -------
    tuple[bool, int]
        ``(is_table, end_idx)``. ``is_table`` is true only when at least one
        content line was seen. ``end_idx`` is the index where scanning
        stopped, or ``start_idx`` when the first line is not a separator.
    """
    total = len(lines)
    if start_idx >= total:
        return False, start_idx

    opener = lines[start_idx].strip()
    if not opener or not _is_grid_separator_line(opener):
        return False, start_idx

    cursor = start_idx + 1
    saw_content = False

    while cursor < total:
        text = lines[cursor].strip()
        if not text:
            break  # blank line closes the table

        if _is_grid_separator_line(text):
            pass
        elif _is_grid_content_line(text):
            saw_content = True
        else:
            break  # anything else closes the table

        cursor += 1

    # Must have at least one content line to be a valid table
    return saw_content and cursor > start_idx + 1, cursor
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _is_simple_table(lines: list[str], start_idx: int) -> tuple[bool, int]:
    """Check for an rST simple table starting at ``start_idx``.

    Simple tables look like:
    =====  =====
    A      B
    =====  =====
    1      2
    =====  =====

    The first line must be a ``=`` separator. Following lines belong to the
    table while they are separators or pass ``_is_simple_content_line``
    against the first separator; a blank or other line stops the scan.

    Returns
    -------
    tuple[bool, int]
        ``(is_table, end_idx)``. ``is_table`` requires at least one content
        line and a separator as the last scanned line. ``end_idx`` is the
        index where scanning stopped, or ``start_idx`` when the first line is
        not a separator.
    """
    total = len(lines)
    if start_idx >= total:
        return False, start_idx

    border = lines[start_idx].strip()
    if not border or not _is_simple_separator_line(border):
        return False, start_idx

    cursor = start_idx + 1
    saw_content = False

    while cursor < total:
        text = lines[cursor].strip()
        if not text:
            break  # blank line closes the table

        if _is_simple_separator_line(text):
            pass
        elif _is_simple_content_line(text, border):
            saw_content = True
        else:
            break  # anything else closes the table

        cursor += 1

    # Must have at least one content line and end with separator
    is_table = (
        saw_content
        and cursor > start_idx + 1
        and cursor <= total
        and _is_simple_separator_line(lines[cursor - 1].strip())
    )
    return is_table, cursor
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def _is_grid_separator_line(line: str) -> bool:
|
|
523
|
+
"""Check if line is a grid table separator (starts with + and
|
|
524
|
+
contains + - =).
|
|
525
|
+
"""
|
|
526
|
+
if not line or not line.startswith('+'):
|
|
527
|
+
return False
|
|
528
|
+
# Should contain only +, -, =, and spaces
|
|
529
|
+
# Must end with + to be a complete separator line
|
|
530
|
+
return (
|
|
531
|
+
all(c in '+-= ' for c in line)
|
|
532
|
+
and '+' in line
|
|
533
|
+
and line.rstrip().endswith('+')
|
|
534
|
+
)
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _is_grid_content_line(line: str) -> bool:
|
|
538
|
+
"""Check if line is a grid table content line (starts and ends with |)."""
|
|
539
|
+
stripped = line.strip()
|
|
540
|
+
return stripped.startswith('|') and stripped.endswith('|')
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def _is_simple_separator_line(line: str) -> bool:
|
|
544
|
+
"""
|
|
545
|
+
Check if line is a simple table separator (contains only = and spaces).
|
|
546
|
+
"""
|
|
547
|
+
if not line:
|
|
548
|
+
return False
|
|
549
|
+
# Should contain only = and spaces, and at least one =
|
|
550
|
+
return all(c in '= ' for c in line) and '=' in line
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _is_simple_content_line(line: str, separator_line: str) -> bool:
|
|
554
|
+
"""
|
|
555
|
+
Check if line could be content for a simple table based on separator
|
|
556
|
+
pattern.
|
|
557
|
+
"""
|
|
558
|
+
if not line:
|
|
559
|
+
return False
|
|
560
|
+
|
|
561
|
+
# Simple heuristic: content line should not be longer than separator
|
|
562
|
+
# and should contain some non-space characters
|
|
563
|
+
return len(line.rstrip()) <= len(separator_line) and bool(line.strip())
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def is_bulleted_list(lines: list[str], start_idx: int = 0) -> tuple[bool, int]:
    """Report whether a bulleted or numbered list starts at ``start_idx``.

    The first line must be a list item. The list then runs over:

    - continuation lines, i.e. lines indented deeper than the first item;
    - further items of the same kind (ordered vs unordered) and, for ordered
      lists, the same marker format as the first item.

    A blank line or any other line ends the list.

    Parameters
    ----------
    lines : list[str]
        The list of lines to check.
    start_idx : int, optional
        The starting index to check from, by default 0.

    Returns
    -------
    tuple[bool, int]
        ``(True, end_idx)`` with ``end_idx`` one past the last list line, or
        ``(False, start_idx)`` when no list starts here.
    """
    if start_idx >= len(lines):
        return False, start_idx

    opener = lines[start_idx]
    opener_text = opener.strip()
    if not opener_text or not _is_list_item(opener_text):
        return False, start_idx

    # The first item fixes the kind of list being collected.
    ordered = _is_ordered_list_item(opener_text)
    marker_style = _get_list_format(opener_text) if ordered else None
    base_indent = len(opener) - len(opener.lstrip(' '))

    cursor = start_idx + 1
    while cursor < len(lines):
        raw = lines[cursor]
        text = raw.strip()

        if not text:
            break  # blank line ends the list

        # Continuation lines are accepted before any marker check.
        if not _is_continuation_line(raw, base_indent):
            same_kind = (
                _is_list_item(text)
                and _is_ordered_list_item(text) == ordered
                and (not ordered or _get_list_format(text) == marker_style)
            )
            if not same_kind:
                break

        cursor += 1

    # Need at least one list item to be considered a list
    return cursor > start_idx, cursor
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _is_list_item(line: str) -> bool:
    """Tell whether ``line`` is an unordered or an ordered list item."""
    checks = (_is_unordered_list_item, _is_ordered_list_item)
    return any(check(line) for check in checks)
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def _is_unordered_list_item(line: str) -> bool:
|
|
649
|
+
"""Check if a line is an unordered list item (starts with -, *, or +)."""
|
|
650
|
+
stripped = line.lstrip()
|
|
651
|
+
return (
|
|
652
|
+
stripped.startswith('- ')
|
|
653
|
+
or stripped.startswith('* ')
|
|
654
|
+
or stripped.startswith('+ ')
|
|
655
|
+
)
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
def _is_ordered_list_item(line: str) -> bool:
|
|
659
|
+
"""Check if a line is an ordered list item.
|
|
660
|
+
|
|
661
|
+
Supports formats:
|
|
662
|
+
- number. (e.g., "1. ", "2. ")
|
|
663
|
+
- number) (e.g., "1) ", "2) ")
|
|
664
|
+
- (number) (e.g., "(1) ", "(2) ")
|
|
665
|
+
"""
|
|
666
|
+
stripped = line.lstrip()
|
|
667
|
+
if not stripped:
|
|
668
|
+
return False
|
|
669
|
+
|
|
670
|
+
# Look for patterns:
|
|
671
|
+
# - digits followed by . or ) followed by space
|
|
672
|
+
# - ( followed by digits followed by ) followed by space
|
|
673
|
+
import re
|
|
674
|
+
|
|
675
|
+
pattern = r'^(\d+[.)] |\(\d+\) )'
|
|
676
|
+
return bool(re.match(pattern, stripped))
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def _get_list_format(line: str) -> str | None:
|
|
680
|
+
"""Get the format of an ordered list item.
|
|
681
|
+
|
|
682
|
+
Returns
|
|
683
|
+
-------
|
|
684
|
+
- '.' for "1. " format
|
|
685
|
+
- ')' for "1) " format
|
|
686
|
+
- '()' for "(1) " format
|
|
687
|
+
- None if not an ordered list item
|
|
688
|
+
"""
|
|
689
|
+
stripped = line.lstrip()
|
|
690
|
+
if not stripped:
|
|
691
|
+
return None
|
|
692
|
+
|
|
693
|
+
import re
|
|
694
|
+
|
|
695
|
+
dot_match = re.match(r'^\d+\. ', stripped)
|
|
696
|
+
paren_match = re.match(r'^\d+\) ', stripped)
|
|
697
|
+
full_paren_match = re.match(r'^\(\d+\) ', stripped)
|
|
698
|
+
|
|
699
|
+
if dot_match:
|
|
700
|
+
return '.'
|
|
701
|
+
|
|
702
|
+
if paren_match:
|
|
703
|
+
return ')'
|
|
704
|
+
|
|
705
|
+
if full_paren_match:
|
|
706
|
+
return '()'
|
|
707
|
+
|
|
708
|
+
return None
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
def _is_continuation_line(line: str, list_item_indent: int) -> bool:
|
|
712
|
+
"""Check if a line is a continuation of a multi-line list item.
|
|
713
|
+
|
|
714
|
+
A continuation line is indented further than the list item marker
|
|
715
|
+
and contains some content.
|
|
716
|
+
"""
|
|
717
|
+
if not line or not line.strip():
|
|
718
|
+
return False
|
|
719
|
+
|
|
720
|
+
# Get the indentation level of this line
|
|
721
|
+
stripped = line.lstrip(' ')
|
|
722
|
+
line_indent = len(line) - len(stripped)
|
|
723
|
+
|
|
724
|
+
# Must be indented more than the list item marker to be a continuation
|
|
725
|
+
return line_indent > list_item_indent
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def _is_literal_block_paragraph(
|
|
729
|
+
lines: list[str], start_idx: int
|
|
730
|
+
) -> tuple[bool, int]:
|
|
731
|
+
"""Check if lines starting at start_idx form a literal block following ::.
|
|
732
|
+
|
|
733
|
+
A literal block is a paragraph that follows a line ending with ::
|
|
734
|
+
(double colon). The entire paragraph should not be wrapped.
|
|
735
|
+
|
|
736
|
+
Parameters
|
|
737
|
+
----------
|
|
738
|
+
lines : list[str]
|
|
739
|
+
The list of lines to check.
|
|
740
|
+
start_idx : int
|
|
741
|
+
The starting index to check from.
|
|
742
|
+
|
|
743
|
+
Returns
|
|
744
|
+
-------
|
|
745
|
+
tuple[bool, int]
|
|
746
|
+
A tuple of (is_literal_block, end_idx) where is_literal_block indicates
|
|
747
|
+
if a literal block was found starting at start_idx, and end_idx is the
|
|
748
|
+
index after the last line of the block (or start_idx if no block found)
|
|
749
|
+
"""
|
|
750
|
+
if start_idx >= len(lines):
|
|
751
|
+
return False, start_idx
|
|
752
|
+
|
|
753
|
+
# Check if current line starts a paragraph after a :: line
|
|
754
|
+
if start_idx == 0:
|
|
755
|
+
return False, start_idx
|
|
756
|
+
|
|
757
|
+
# Look at the previous non-empty line to see if it ends with ::
|
|
758
|
+
prev_idx = start_idx - 1
|
|
759
|
+
while prev_idx >= 0 and not lines[prev_idx].strip():
|
|
760
|
+
prev_idx -= 1
|
|
761
|
+
|
|
762
|
+
if prev_idx < 0:
|
|
763
|
+
return False, start_idx
|
|
764
|
+
|
|
765
|
+
prev_line = lines[prev_idx].rstrip()
|
|
766
|
+
if not prev_line.endswith('::'):
|
|
767
|
+
return False, start_idx
|
|
768
|
+
|
|
769
|
+
# Current line starts a literal block - find its end
|
|
770
|
+
current_idx = start_idx
|
|
771
|
+
while current_idx < len(lines):
|
|
772
|
+
line = lines[current_idx].strip()
|
|
773
|
+
|
|
774
|
+
# Empty line ends the literal block
|
|
775
|
+
if not line:
|
|
776
|
+
break
|
|
777
|
+
|
|
778
|
+
current_idx += 1
|
|
779
|
+
|
|
780
|
+
# Need at least one line to be a literal block
|
|
781
|
+
return current_idx > start_idx, current_idx
|