sec2md 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic. Click here for more details.
- sec2md/__init__.py +1 -1
- sec2md/section_extractor.py +121 -61
- {sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/METADATA +1 -1
- {sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/RECORD +7 -7
- {sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/WHEEL +0 -0
- {sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/top_level.txt +0 -0
sec2md/__init__.py
CHANGED
sec2md/section_extractor.py
CHANGED
|
@@ -143,8 +143,9 @@ class SectionExtractor:
|
|
|
143
143
|
# ========== 8-K Specific Methods ==========
|
|
144
144
|
|
|
145
145
|
# 8-K item header regex: ITEM 1.01 / 7.01 / 9.01
|
|
146
|
+
# Simplified pattern: match ONLY at line start, with strict formatting
|
|
146
147
|
_ITEM_8K_RE = re.compile(
|
|
147
|
-
rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
|
|
148
|
+
rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)$',
|
|
148
149
|
re.IGNORECASE | re.MULTILINE
|
|
149
150
|
)
|
|
150
151
|
|
|
@@ -250,84 +251,140 @@ class SectionExtractor:
|
|
|
250
251
|
end = mstop.start() if mstop else next_item_start
|
|
251
252
|
return doc[start_after:end].strip()
|
|
252
253
|
|
|
254
|
+
def _is_8k_boilerplate_page(self, page_content: str, page_num: int) -> bool:
|
|
255
|
+
"""Detect cover, TOC, and signature pages in 8-Ks."""
|
|
256
|
+
# Cover page is always page 1
|
|
257
|
+
if page_num == 1:
|
|
258
|
+
return True
|
|
259
|
+
|
|
260
|
+
# TOC page: has "TABLE OF CONTENTS" header (with or without bold markdown)
|
|
261
|
+
# Also detect if page has multiple ITEM entries with page numbers (TOC table pattern)
|
|
262
|
+
if re.search(r'TABLE OF CONTENTS', page_content, re.IGNORECASE):
|
|
263
|
+
return True
|
|
264
|
+
|
|
265
|
+
# Alternative TOC detection: page has multiple items with "| digit |" pattern (page numbers in table)
|
|
266
|
+
item_with_page_count = len(re.findall(r'ITEM\s+[1-9]\.\d{2}.*?\|\s*\d+\s*\|', page_content, re.IGNORECASE))
|
|
267
|
+
if item_with_page_count >= 2: # If 2+ items have page numbers, it's a TOC
|
|
268
|
+
return True
|
|
269
|
+
|
|
270
|
+
# Signatures page: has "SIGNATURES" header and filing signature text
|
|
271
|
+
if re.search(r'\*\*SIGNATURES\*\*', page_content) and \
|
|
272
|
+
re.search(r'Pursuant to the requirements', page_content, re.IGNORECASE):
|
|
273
|
+
return True
|
|
274
|
+
|
|
275
|
+
return False
|
|
276
|
+
|
|
253
277
|
def _get_8k_sections(self) -> List[Any]:
|
|
254
|
-
"""Extract 8-K sections
|
|
278
|
+
"""Extract 8-K sections using page-by-page approach like standard extractor."""
|
|
255
279
|
from sec2md.models import Section, Page, ITEM_8K_TITLES
|
|
256
280
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
281
|
+
sections = []
|
|
282
|
+
current_item = None
|
|
283
|
+
current_item_title = None
|
|
284
|
+
current_pages: List[Dict] = []
|
|
260
285
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
286
|
+
def flush_section():
|
|
287
|
+
nonlocal sections, current_item, current_item_title, current_pages
|
|
288
|
+
if current_pages and current_item:
|
|
289
|
+
# Parse exhibits if this is ITEM 9.01
|
|
290
|
+
exhibits = None
|
|
291
|
+
if current_item.startswith("ITEM 9.01"):
|
|
292
|
+
content = "\n".join(p["content"] for p in current_pages)
|
|
293
|
+
md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', content, re.IGNORECASE | re.MULTILINE)
|
|
294
|
+
ex_block = content[md.end():].strip() if md else content
|
|
295
|
+
parsed_exhibits = self._parse_exhibits(ex_block)
|
|
296
|
+
exhibits = parsed_exhibits if parsed_exhibits else None
|
|
297
|
+
|
|
298
|
+
# Convert page dicts to Page objects
|
|
299
|
+
page_objects = [Page(number=p["page"], content=p["content"], elements=None, text_blocks=None)
|
|
300
|
+
for p in current_pages]
|
|
301
|
+
|
|
302
|
+
sections.append(Section(
|
|
303
|
+
part=None,
|
|
304
|
+
item=current_item,
|
|
305
|
+
item_title=current_item_title,
|
|
306
|
+
pages=page_objects,
|
|
307
|
+
exhibits=exhibits
|
|
308
|
+
))
|
|
309
|
+
current_pages = []
|
|
264
310
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
code = self._normalize_8k_item_code(m.group(2))
|
|
269
|
-
title_inline = (m.group(3) or "").strip()
|
|
270
|
-
# Clean markdown artifacts from title
|
|
271
|
-
title_inline = MD_EDGE.sub("", title_inline)
|
|
311
|
+
for page_dict in self.pages:
|
|
312
|
+
page_num = page_dict["page"]
|
|
313
|
+
content = page_dict["content"]
|
|
272
314
|
|
|
273
|
-
# Skip
|
|
274
|
-
if
|
|
275
|
-
self._log(f"DEBUG:
|
|
315
|
+
# Skip boilerplate pages
|
|
316
|
+
if self._is_8k_boilerplate_page(content, page_num):
|
|
317
|
+
self._log(f"DEBUG: Page {page_num} is boilerplate, skipping")
|
|
276
318
|
continue
|
|
277
319
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
320
|
+
# Find first valid ITEM header on this page (if any)
|
|
321
|
+
item_m = None
|
|
322
|
+
first_idx = None
|
|
281
323
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
324
|
+
for m in self._ITEM_8K_RE.finditer(content):
|
|
325
|
+
# Get the full line for this match
|
|
326
|
+
line_start = content.rfind('\n', 0, m.start()) + 1
|
|
327
|
+
line_end = content.find('\n', m.end())
|
|
328
|
+
if line_end == -1:
|
|
329
|
+
line_end = len(content)
|
|
330
|
+
full_line = content[line_start:line_end].strip()
|
|
285
331
|
|
|
286
|
-
|
|
332
|
+
# Skip if this is a table row (contains pipe characters)
|
|
333
|
+
if '|' in full_line:
|
|
334
|
+
self._log(f"DEBUG: Page {page_num} skipping table row: {full_line[:60]}")
|
|
335
|
+
continue
|
|
287
336
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
337
|
+
# Get item code and title
|
|
338
|
+
code = self._normalize_8k_item_code(m.group(2))
|
|
339
|
+
title_inline = (m.group(3) or "").strip()
|
|
340
|
+
title_inline = MD_EDGE.sub("", title_inline)
|
|
341
|
+
|
|
342
|
+
# This is a valid ITEM header
|
|
343
|
+
item_m = m
|
|
344
|
+
first_idx = m.start()
|
|
345
|
+
self._log(f"DEBUG: Page {page_num} found ITEM {code} at position {first_idx}")
|
|
346
|
+
break
|
|
347
|
+
|
|
348
|
+
# No item header found - add to current section
|
|
349
|
+
if first_idx is None:
|
|
350
|
+
if current_item:
|
|
351
|
+
current_pages.append({"page": page_num, "content": content.strip()})
|
|
352
|
+
continue
|
|
353
|
+
|
|
354
|
+
# Found item header - split page
|
|
355
|
+
before = content[:first_idx].strip()
|
|
356
|
+
after = content[first_idx:].strip()
|
|
357
|
+
|
|
358
|
+
# Add "before" content to current section
|
|
359
|
+
if current_item and before:
|
|
360
|
+
current_pages.append({"page": page_num, "content": before})
|
|
361
|
+
|
|
362
|
+
# Flush current section
|
|
363
|
+
flush_section()
|
|
364
|
+
|
|
365
|
+
# Start new section
|
|
366
|
+
code = self._normalize_8k_item_code(item_m.group(2))
|
|
367
|
+
title_inline = (item_m.group(3) or "").strip()
|
|
368
|
+
title_inline = MD_EDGE.sub("", title_inline)
|
|
369
|
+
current_item = f"ITEM {code}"
|
|
370
|
+
current_item_title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
|
|
294
371
|
|
|
295
372
|
# Filter by desired_items if provided
|
|
296
373
|
if self.desired_items and code not in self.desired_items:
|
|
297
374
|
self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
|
|
375
|
+
current_item = None
|
|
376
|
+
current_item_title = None
|
|
298
377
|
continue
|
|
299
378
|
|
|
300
|
-
#
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
|
|
304
|
-
ex_block = body[md.end():].strip() if md else body
|
|
305
|
-
exhibits = self._parse_exhibits(ex_block)
|
|
306
|
-
self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")
|
|
307
|
-
|
|
308
|
-
# Map back to Page objects (approximate page boundaries from original content)
|
|
309
|
-
# Since 8-K sections can span pages, we need to find which pages contain this content
|
|
310
|
-
section_pages = self._map_8k_content_to_pages(body)
|
|
311
|
-
|
|
312
|
-
# Skip sections with no matching pages
|
|
313
|
-
if not section_pages:
|
|
314
|
-
self._log(f"DEBUG: Skipping ITEM {code} (no pages found)")
|
|
315
|
-
continue
|
|
316
|
-
|
|
317
|
-
# Create Section with exhibits (now part of the model)
|
|
318
|
-
section = Section(
|
|
319
|
-
part=None, # 8-K has no PART divisions
|
|
320
|
-
item=f"ITEM {code}",
|
|
321
|
-
item_title=h["title"],
|
|
322
|
-
pages=section_pages,
|
|
323
|
-
exhibits=exhibits if exhibits else None
|
|
324
|
-
)
|
|
379
|
+
# Add "after" content to new section
|
|
380
|
+
if after:
|
|
381
|
+
current_pages.append({"page": page_num, "content": after})
|
|
325
382
|
|
|
326
|
-
|
|
327
|
-
|
|
383
|
+
# Flush final section
|
|
384
|
+
flush_section()
|
|
328
385
|
|
|
329
|
-
self._log(f"DEBUG: Total sections extracted: {len(
|
|
330
|
-
return
|
|
386
|
+
self._log(f"DEBUG: Total sections extracted: {len(sections)}")
|
|
387
|
+
return sections
|
|
331
388
|
|
|
332
389
|
def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
|
|
333
390
|
"""Map extracted section content back to Page objects, splitting at section boundaries."""
|
|
@@ -337,7 +394,10 @@ class SectionExtractor:
|
|
|
337
394
|
section_content_cleaned = self._clean_8k_text(section_content)
|
|
338
395
|
remaining_section = section_content_cleaned
|
|
339
396
|
|
|
340
|
-
|
|
397
|
+
# Use filtered pages (excludes cover, TOC, signatures)
|
|
398
|
+
pages_to_search = getattr(self, '_filtered_8k_pages', self.pages)
|
|
399
|
+
|
|
400
|
+
for page_dict in pages_to_search:
|
|
341
401
|
page_num = page_dict["page"]
|
|
342
402
|
page_content = page_dict["content"]
|
|
343
403
|
page_content_cleaned = self._clean_8k_text(page_content)
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
sec2md/__init__.py,sha256=
|
|
1
|
+
sec2md/__init__.py,sha256=cKVj4J_IPlcrZASlumEpjv69dMjIveatYUtPjASm1nE,988
|
|
2
2
|
sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
|
|
3
3
|
sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
|
|
4
4
|
sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
|
|
5
5
|
sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
|
|
6
6
|
sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
|
|
7
|
-
sec2md/section_extractor.py,sha256=
|
|
7
|
+
sec2md/section_extractor.py,sha256=0MqS_xluIQcI10u8-q7pk3v0uG8p8htlb4Sv0agh3Xg,30663
|
|
8
8
|
sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
|
|
9
9
|
sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
|
|
10
10
|
sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
|
|
@@ -12,8 +12,8 @@ sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
12
12
|
sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
|
|
13
13
|
sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
|
|
14
14
|
sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
|
|
15
|
-
sec2md-0.1.
|
|
16
|
-
sec2md-0.1.
|
|
17
|
-
sec2md-0.1.
|
|
18
|
-
sec2md-0.1.
|
|
19
|
-
sec2md-0.1.
|
|
15
|
+
sec2md-0.1.12.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
|
|
16
|
+
sec2md-0.1.12.dist-info/METADATA,sha256=eSwrrLVm2fNKlpEIBKY-wm4VwKwwh7i-egy3FIfURqA,7626
|
|
17
|
+
sec2md-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
18
|
+
sec2md-0.1.12.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
|
|
19
|
+
sec2md-0.1.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|