sec2md 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic. Click here for more details.
- sec2md/__init__.py +24 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +116 -0
- sec2md/chunker/markdown_chunk.py +76 -0
- sec2md/chunker/markdown_chunker.py +234 -0
- sec2md/chunking.py +66 -0
- sec2md/core.py +93 -0
- sec2md/models.py +153 -0
- sec2md/parser.py +586 -0
- sec2md/section_extractor.py +316 -0
- sec2md/sections.py +104 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.0.dist-info/METADATA +217 -0
- sec2md-0.1.0.dist-info/RECORD +19 -0
- sec2md-0.1.0.dist-info/WHEEL +5 -0
- sec2md-0.1.0.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.0.dist-info/top_level.txt +1 -0
sec2md/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""sec2md: Convert SEC filings to high-quality Markdown."""

# Re-export the package's public entry points so callers can import them
# directly from ``sec2md`` instead of from the individual submodules.
from sec2md.core import convert_to_markdown
from sec2md.utils import flatten_note
from sec2md.sections import extract_sections, get_section
from sec2md.chunking import chunk_pages, chunk_section
from sec2md.models import Page, Section, Item10K, Item10Q, FilingType
from sec2md.chunker.markdown_chunk import MarkdownChunk

__version__ = "0.1.0"

# Names exported by ``from sec2md import *``. Every entry corresponds to one
# of the names imported above.
__all__ = [
    "convert_to_markdown",
    "flatten_note",
    "extract_sections",
    "get_section",
    "chunk_pages",
    "chunk_section",
    "Page",
    "Section",
    "Item10K",
    "Item10Q",
    "FilingType",
    "MarkdownChunk",
]
|
|
sec2md/absolute_table_parser.py
ADDED
|
@@ -0,0 +1,622 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from bs4 import Tag
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from typing import List, Optional, Tuple, Dict
|
|
7
|
+
|
|
8
|
+
NUMERIC_RE = re.compile(r"""
|
|
9
|
+
^\s*
|
|
10
|
+
[\(\[]? # optional opening paren/bracket
|
|
11
|
+
[\-—–]?\s* # optional dash
|
|
12
|
+
[$€£¥]?\s* # optional currency
|
|
13
|
+
\d+(?:[.,]\d{3})* # integer part (with or without thousands)
|
|
14
|
+
(?:[.,]\d+)? # decimals
|
|
15
|
+
\s*%? # optional percent
|
|
16
|
+
[\)\]]?\s*$ # optional closing paren/bracket
|
|
17
|
+
""", re.X)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def median(values: List[float]) -> float:
    """Return the median of ``values``.

    Args:
        values: Numbers to summarize; the list is not modified.

    Returns:
        ``0.0`` for an empty list, the middle element for an odd-length
        list, and the mean of the two middle elements for an even-length list.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    # Even count: average the two values that straddle the midpoint.
    lower, upper = ordered[middle - 1], ordered[middle]
    return (lower + upper) / 2.0
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AbsolutelyPositionedTableParser:
    """
    Parser for pseudo-tables constructed from absolutely positioned div elements.

    These appear in some SEC filings where tables are rendered using position:absolute
    divs instead of proper HTML table elements.

    The parser reads each element's ``left``/``top`` pixel offsets from its inline
    style, clusters those offsets into rows and columns, and renders either a
    Markdown table (``to_markdown``) or plain text (``to_text``).
    """

    # CSS property patterns. The lookbehind keeps "left:" from matching inside
    # longer property names such as "padding-left:" or "margin-left:".
    _LEFT_RE = re.compile(r'(?<![\w-])left:\s*(\d+(?:\.\d+)?)px', re.I)
    _TOP_RE = re.compile(r'(?<![\w-])top:\s*(\d+(?:\.\d+)?)px', re.I)
    # Applied to a lower-cased style string with spaces already removed.
    _WIDTH_RE = re.compile(r'(?<![\w-])width:(\d+(?:\.\d+)?)px')
    # A pipe that is not preceded by a backslash, i.e. a real cell delimiter.
    _UNESCAPED_PIPE_RE = re.compile(r'(?<!\\)\|')
    # Footnote marker such as "(a)" at the start of a cell.
    _FOOTNOTE_RE = re.compile(r'^\([a-z]\)')

    def __init__(self, elements: List["Tag"]):
        """
        Initialize with a list of absolutely positioned elements.

        Args:
            elements: List of Tag elements that are absolutely positioned
        """
        self.elements = elements
        self.positioned_elements = self._extract_positions()

    def _get_position(self, el: "Tag") -> Optional[Tuple[float, float]]:
        """Extract (left, top) position in pixels from the element's inline style.

        Returns:
            ``(left, top)`` when both properties are present as non-negative
            ``px`` values, otherwise ``None`` (also for non-Tag input).
        """
        if not isinstance(el, Tag):
            return None
        style = el.get("style") or ""
        left_match = self._LEFT_RE.search(style)
        top_match = self._TOP_RE.search(style)
        if left_match and top_match:
            return (float(left_match.group(1)), float(top_match.group(1)))
        return None

    def _clean_text(self, element: "Tag") -> str:
        """Extract text from an element and normalize its whitespace.

        Zero-width spaces and BOM characters are removed, non-breaking spaces
        become regular spaces, and whitespace runs collapse to one space.
        """
        text = element.get_text(separator=" ", strip=True)
        text = text.replace("\u200b", "").replace("\ufeff", "").replace("\xa0", " ")
        return re.sub(r'\s+', ' ', text).strip()

    def _is_bold(self, el: "Tag") -> bool:
        """Check if the element's inline style declares a bold font weight."""
        # Drop all whitespace so "font-weight: 700" and "font-weight:700"
        # are treated alike (the same normalization _is_spacer applies).
        style = "".join((el.get("style") or "").lower().split())
        return "font-weight:700" in style or "font-weight:bold" in style

    def _is_spacer(self, el: "Tag") -> bool:
        """
        Detect inline-block spacer boxes that should be treated as spaces.

        These are common in PDF->HTML conversions:
        <div style="display:inline-block;width:5px">&nbsp;</div>

        An element is a spacer when it is ``display:inline-block``, narrower
        than 30px, and has no visible text. Non-breaking spaces count as
        whitespace, so an nbsp-only box is a spacer while a narrow box that
        holds real text is not.
        """
        if not isinstance(el, Tag):
            return False

        style = (el.get("style") or "").lower().replace(" ", "")
        if 'display:inline-block' not in style:
            return False

        width_match = self._WIDTH_RE.search(style)
        if width_match is None or float(width_match.group(1)) >= 30:
            return False

        # strip=True also strips \xa0, so nbsp-only content yields "".
        return not el.get_text(strip=True)

    def _contains_number(self, text: str) -> bool:
        """Check whether ``text`` as a whole is a formatted number.

        NUMERIC_RE is anchored at both ends, so a number embedded in longer
        text does not count.
        """
        return bool(NUMERIC_RE.search(text))

    def _extract_positions(self) -> List[Tuple[float, float, "Tag"]]:
        """Collect ``(left, top, element)`` for every usable element.

        Spacer boxes are kept (when positioned) so that later rendering can
        turn them into spaces; other elements are kept only if they have both
        a position and non-empty text.
        """
        positioned = []
        for el in self.elements:
            pos = self._get_position(el)

            if self._is_spacer(el):
                # Keep the spacer element itself; renderers re-detect it
                # with _is_spacer and emit a space.
                if pos:
                    positioned.append((pos[0], pos[1], el))
                continue

            # Only include elements with both position and text content
            if pos and self._clean_text(el):
                positioned.append((pos[0], pos[1], el))
        return positioned

    def _filter_table_content(
        self, elements: List[Tuple[float, float, "Tag"]]
    ) -> List[Tuple[float, float, "Tag"]]:
        """
        Filter out title/caption text that appears before the actual table.

        Tables often have introductory text like "The following table sets forth..."
        This should be excluded from table detection and rendering.

        The table is assumed to start at the topmost row holding at least
        three elements; anything more than 30px above that row is dropped.
        The input is returned unchanged when it has fewer than 10 elements,
        when no such row exists, or when filtering would leave fewer than 6.
        """
        if len(elements) < 10:
            return elements  # Too small to have significant leading text

        # Group by Y position to find rows
        y_clusters = self._cluster_by_eps([top for _, top, _ in elements], eps=15)
        rows = defaultdict(list)
        for item in elements:
            rows[y_clusters[item[1]]].append(item)

        # Walk rows top-to-bottom and stop at the first one that looks tabular.
        table_start_y = None
        ordered_rows = sorted(rows.values(), key=lambda items: min(t for _, t, _ in items))
        for row_items in ordered_rows:
            if len(row_items) >= 3:
                table_start_y = min(t for _, t, _ in row_items)
                break

        if table_start_y is None:
            return elements  # Couldn't identify table start, return all

        filtered = [item for item in elements if item[1] >= table_start_y - 30]
        return filtered if len(filtered) >= 6 else elements  # Sanity check

    def _cluster_by_eps(self, values: List[float], eps: float) -> Dict[float, int]:
        """
        Cluster positions within epsilon tolerance.

        Values are visited in ascending order; a new cluster starts whenever a
        value lies more than ``eps`` above the first value (anchor) of the
        current cluster. This absorbs rendering jitter
        (e.g., 100.0, 100.5, 101.2 end up in the same cluster).

        Args:
            values: List of coordinate values
            eps: Epsilon tolerance (pixels)

        Returns:
            Dictionary mapping value -> cluster_id (ids are 0, 1, 2, ... in
            ascending coordinate order); empty dict for empty input
        """
        if not values:
            return {}

        sorted_vals = sorted(set(values))
        clusters = {}
        cluster_id = 0
        anchor = sorted_vals[0]

        for val in sorted_vals:
            if val - anchor > eps:
                cluster_id += 1
                anchor = val
            clusters[val] = cluster_id

        return clusters

    def is_table_like(self) -> bool:
        """
        Determine if the positioned elements form a table-like structure.

        This uses multiple heuristics to distinguish actual data tables from
        normal paragraph text that happens to be absolutely positioned:
        grid size, share of numeric cells, average cell length, share of
        sentence-like cells, grid density, elements per row, and the presence
        of at least one mostly-numeric column.

        Returns:
            True if elements appear to form a table, False otherwise
        """
        if len(self.positioned_elements) < 6:  # Need at least a 2x3 table
            return False

        # Filter out caption/title text
        filtered_elements = self._filter_table_content(self.positioned_elements)
        total = len(filtered_elements)
        if total < 6:
            return False

        # Cluster with epsilon tolerance (12px for rows, 50px for columns)
        y_clusters = self._cluster_by_eps([top for _, top, _ in filtered_elements], eps=12)
        x_clusters = self._cluster_by_eps([left for left, _, _ in filtered_elements], eps=50)
        n_rows = len(set(y_clusters.values()))
        n_cols = len(set(x_clusters.values()))

        # Need at least a 2x2 grid
        if n_rows < 2 or n_cols < 2:
            return False

        # Clean each element's text once; every check below reuses it.
        texts = [self._clean_text(el) for _, _, el in filtered_elements]
        numeric_flags = [
            not self._is_spacer(el) and self._contains_number(text)
            for (_, _, el), text in zip(filtered_elements, texts)
        ]

        # CRITICAL: tables should have numbers - at least 20% of cells
        if sum(numeric_flags) / total < 0.20:
            return False

        # If average cell is > 50 characters, probably paragraph text, not a table
        if sum(len(text) for text in texts) / total > 50:
            return False

        # If >40% of cells have periods and long text, probably prose
        prose_like = sum(1 for text in texts if '.' in text and len(text) > 20)
        if prose_like / total > 0.40:
            return False

        # Less than 25% of the grid filled = probably not a table
        if total / (n_rows * n_cols) < 0.25:
            return False

        # Rows should average at least two elements. Every element belongs to
        # exactly one row cluster, so the mean row size is total / n_rows.
        if total / n_rows < 2:
            return False

        # At least one column with 2+ elements must be mostly (>50%) numeric
        column_flags = defaultdict(list)
        for (left, _, _), flag in zip(filtered_elements, numeric_flags):
            column_flags[x_clusters[left]].append(flag)

        return any(
            len(flags) >= 2 and sum(flags) / len(flags) > 0.5
            for flags in column_flags.values()
        )

    def to_grid(self) -> Optional[List[List[List[Tuple[float, float, "Tag"]]]]]:
        """
        Convert positioned elements to a 2D grid structure.

        Returns:
            2D grid where each cell contains a list of (left, top, element) tuples
            sorted by ``left``, or None if structure is not table-like
        """
        if not self.is_table_like():
            return None

        # Filter out caption/title text
        filtered_elements = self._filter_table_content(self.positioned_elements)

        # Cluster with the same tolerances is_table_like uses
        y_clusters = self._cluster_by_eps([top for _, top, _ in filtered_elements], eps=12)
        x_clusters = self._cluster_by_eps([left for left, _, _ in filtered_elements], eps=50)

        # Map cluster ids to 0-based grid indices once, not per element.
        row_index = {cid: i for i, cid in enumerate(sorted(set(y_clusters.values())))}
        col_index = {cid: i for i, cid in enumerate(sorted(set(x_clusters.values())))}

        grid = [[[] for _ in col_index] for _ in row_index]
        for left, top, element in filtered_elements:
            row = row_index[y_clusters[top]]
            col = col_index[x_clusters[left]]
            grid[row][col].append((left, top, element))

        # Sort by horizontal position within each cell
        for grid_row in grid:
            for cell_elements in grid_row:
                cell_elements.sort(key=lambda item: item[0])

        return grid

    def _merge_cell_text(self, items: List[Tuple[float, float, "Tag"]]) -> str:
        """Concatenate the text of ``items`` in order.

        Spacer boxes become a single space (only after some text), empty
        elements are skipped, and bold elements are wrapped in ``**``.
        """
        parts: List[str] = []
        for _, _, element in items:
            if self._is_spacer(element):
                # A leading spacer carries no information; skip it.
                if parts:
                    parts.append(" ")
                continue
            text = self._clean_text(element)
            if not text:
                continue
            parts.append(f"**{text}**" if self._is_bold(element) else text)
        return "".join(parts)

    def to_markdown(self) -> str:
        """
        Convert to markdown table format.

        The first grid row is emitted as the header row. Pipe characters in
        cell text are escaped as ``\\|``.

        Returns:
            Markdown table string, or empty string if not table-like
        """
        grid = self.to_grid()
        if grid is None:
            return ""

        # Extract text from grid, merging elements in same cell
        text_grid = [[self._merge_cell_text(cell) for cell in row] for row in grid]
        if not text_grid:
            return ""

        n_cols = len(text_grid[0])

        lines = []
        for i, row in enumerate(text_grid):
            # Pad row to match column count
            row.extend([""] * (n_cols - len(row)))
            escaped_row = [cell.replace("|", "\\|") for cell in row]
            lines.append("| " + " | ".join(escaped_row) + " |")

            # Add separator after first row (header)
            if i == 0:
                lines.append("| " + " | ".join(["---"] * n_cols) + " |")

        return self._clean_markdown_table("\n".join(lines))

    def _clean_markdown_table(self, markdown: str) -> str:
        """
        Clean up markdown table by removing junk rows and empty columns.

        Rows are split on unescaped pipes only, so cells containing ``\\|``
        stay intact. The separator is the first row made up solely of ``---``
        (and empty) cells; rows up to and including it are always kept.
        Below it, a row is dropped when it is empty, holds a single cell
        shorter than 5 or longer than 100 characters, or starts with a
        footnote marker such as ``(a)``.

        Args:
            markdown: Raw markdown table string

        Returns:
            Cleaned markdown table string. The input is returned unchanged
            when it has fewer than 3 lines, has no separator row, would be
            left with fewer than 3 rows, or has no column with content.
        """
        if not markdown:
            return ""

        lines = markdown.strip().split('\n')
        if len(lines) < 3:  # Need at least header, separator, one data row
            return markdown

        # Parse rows; [1:-1] discards the text outside the outer pipes.
        rows = [
            [cell.strip() for cell in self._UNESCAPED_PIPE_RE.split(line)[1:-1]]
            for line in lines
        ]

        # First row that is a real separator. Requiring at least one '---'
        # keeps an all-empty data row from being mistaken for it.
        separator_idx = next(
            (
                idx for idx, cells in enumerate(rows)
                if '---' in cells and all(c in ('---', '') for c in cells)
            ),
            -1,
        )
        if separator_idx < 0:
            return markdown

        def is_junk_row(row: List[str]) -> bool:
            """Identify footnotes, page numbers and (mostly) empty rows."""
            non_empty = [c for c in row if c and c != '---']
            if not non_empty:
                return True
            if len(non_empty) == 1 and len(non_empty[0]) < 5:  # e.g. page number
                return True
            first_non_empty = next((c for c in row if c), "")
            if self._FOOTNOTE_RE.match(first_non_empty):
                return True
            # One very long cell (footnote text) and rest empty
            return len(non_empty) == 1 and len(non_empty[0]) > 100

        header_rows = rows[:separator_idx + 1]
        body_rows = [row for row in rows[separator_idx + 1:] if not is_junk_row(row)]
        cleaned_rows = header_rows + body_rows
        if len(cleaned_rows) < 3:
            return markdown

        # Identify columns that have content in any non-separator row.
        # Header rows are never removed, so separator_idx is still valid here.
        n_cols = len(cleaned_rows[0])
        col_has_content = [False] * n_cols
        for row_idx, row in enumerate(cleaned_rows):
            if row_idx == separator_idx:
                continue
            for col_idx, cell in enumerate(row[:n_cols]):
                if cell and cell != '---':
                    col_has_content[col_idx] = True

        cols_to_keep = [i for i, has_content in enumerate(col_has_content) if has_content]
        if not cols_to_keep:
            return markdown

        # Rebuild table with kept columns
        result_lines = []
        for row_idx, row in enumerate(cleaned_rows):
            if row_idx == separator_idx:
                kept = ['---'] * len(cols_to_keep)
            else:
                kept = [row[i] if i < len(row) else "" for i in cols_to_keep]
            result_lines.append("| " + " | ".join(kept) + " |")

        return "\n".join(result_lines)

    def _join_lines(self, prev: str, current: str, gap: float, median_gap: float) -> Tuple[str, bool]:
        """
        Smart line joining with hyphenation handling.

        Args:
            prev: Previous line text
            current: Current line text
            gap: Vertical gap between lines (pixels)
            median_gap: Median line gap in document

        Returns:
            Tuple of (joined_text, should_add_newline). When
            should_add_newline is True, joined_text is ``prev`` unchanged and
            the caller is expected to emit ``current`` as a separate line.
        """
        # Hyphenated word continuation
        if prev.endswith('-'):
            if current and current[0].islower():
                # Likely a word split across lines: drop the hyphen and join
                return (prev[:-1] + current, False)
            # Keep hyphen but join with space (e.g., "end-of-year Statement")
            return (prev + " " + current, False)

        # No terminal punctuation means the previous line probably continues
        terminal = ('.', '!', '?', ':', ';', ')', ']')
        continues = not prev.rstrip().endswith(terminal)

        # Small gap + continuation = join with space
        if continues and gap < 1.4 * median_gap:
            return (prev + " " + current, False)

        # Otherwise, separate with newline
        return (prev, True)

    def to_text(self) -> str:
        """
        Convert to plain text format (fallback if not table-like).
        Preserves bold formatting and handles hyphenation.

        Elements are grouped into rows (5px vertical tolerance between
        consecutive elements), each row is rendered left-to-right, and rows
        are then joined or separated by ``_join_lines``. Short all-bold rows
        are treated as headers and preceded by a blank line.

        Returns:
            Text representation with elements sorted by position and formatting preserved
        """
        # Sort by vertical then horizontal position
        sorted_elements = sorted(self.positioned_elements, key=lambda x: (x[1], x[0]))
        if not sorted_elements:
            return ""

        # Typical line spacing; gaps <= 1px are elements on the same line.
        y_coords = [top for _, top, _ in sorted_elements]
        if len(y_coords) > 1:
            median_line_gap = median(
                [b - a for a, b in zip(y_coords, y_coords[1:]) if b - a > 1]
            )
        else:
            median_line_gap = 15.0

        # Group by rows
        rows = []
        current_row = []
        last_top = None
        for left, top, element in sorted_elements:
            if last_top is None or abs(top - last_top) <= 5:  # Same row (5px tolerance)
                current_row.append((left, top, element))
            else:
                if current_row:
                    rows.append(current_row)
                current_row = [(left, top, element)]
            last_top = top
        if current_row:
            rows.append(current_row)

        # Convert to text with formatting and hyphenation handling
        lines: List[str] = []
        prev_y = 0.0  # Y of the last row that actually produced text
        for row in rows:
            # Sort by horizontal position within row
            row.sort(key=lambda x: x[0])
            line = self._merge_cell_text(row)
            if not line:
                continue  # spacer-only row

            current_y = row[0][1]

            # Seed on "nothing emitted yet" rather than on the row index, so
            # that a text-less first row cannot cause the next line to be
            # merged into a non-existent predecessor and lost.
            if not lines:
                lines.append(line)
                prev_y = current_y
                continue

            gap = abs(current_y - prev_y)
            prev_y = current_y
            prev_line = lines[-1]

            # Header = short row whose text-bearing elements are all bold
            content = [el for _, _, el in row if not self._is_spacer(el)]
            is_header = (
                any(self._is_bold(el) for el in content)
                and all(self._is_bold(el) for el in content if self._clean_text(el))
                and len(line) < 80
            )

            if is_header and not prev_line.endswith('-'):
                # Add blank line before header
                lines.append("")
                lines.append(line)
                continue

            joined_text, needs_newline = self._join_lines(prev_line, line, gap, median_line_gap)
            lines[-1] = joined_text
            if needs_newline:
                lines.append(line)

        return "\n".join(lines)
|
|
File without changes
|