sec2md 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

sec2md/__init__.py CHANGED
@@ -10,7 +10,7 @@ from sec2md.chunker.chunker import Chunker
10
10
  from sec2md.parser import Parser
11
11
  from sec2md.section_extractor import SectionExtractor
12
12
 
13
- __version__ = "0.1.10"
13
+ __version__ = "0.1.12"
14
14
  __all__ = [
15
15
  "convert_to_markdown",
16
16
  "flatten_note",
@@ -143,8 +143,9 @@ class SectionExtractor:
143
143
  # ========== 8-K Specific Methods ==========
144
144
 
145
145
  # 8-K item header regex: ITEM 1.01 / 7.01 / 9.01
146
+ # Simplified pattern: match ONLY at line start, with strict formatting
146
147
  _ITEM_8K_RE = re.compile(
147
- rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
148
+ rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)$',
148
149
  re.IGNORECASE | re.MULTILINE
149
150
  )
150
151
 
@@ -250,84 +251,140 @@ class SectionExtractor:
250
251
  end = mstop.start() if mstop else next_item_start
251
252
  return doc[start_after:end].strip()
252
253
 
254
+ def _is_8k_boilerplate_page(self, page_content: str, page_num: int) -> bool:
255
+ """Detect cover, TOC, and signature pages in 8-Ks."""
256
+ # Cover page is always page 1
257
+ if page_num == 1:
258
+ return True
259
+
260
+ # TOC page: has "TABLE OF CONTENTS" header (with or without bold markdown)
261
+ # Also detect if page has multiple ITEM entries with page numbers (TOC table pattern)
262
+ if re.search(r'TABLE OF CONTENTS', page_content, re.IGNORECASE):
263
+ return True
264
+
265
+ # Alternative TOC detection: page has multiple items with "| digit |" pattern (page numbers in table)
266
+ item_with_page_count = len(re.findall(r'ITEM\s+[1-9]\.\d{2}.*?\|\s*\d+\s*\|', page_content, re.IGNORECASE))
267
+ if item_with_page_count >= 2: # If 2+ items have page numbers, it's a TOC
268
+ return True
269
+
270
+ # Signatures page: has "SIGNATURES" header and filing signature text
271
+ if re.search(r'\*\*SIGNATURES\*\*', page_content) and \
272
+ re.search(r'Pursuant to the requirements', page_content, re.IGNORECASE):
273
+ return True
274
+
275
+ return False
276
+
253
277
  def _get_8k_sections(self) -> List[Any]:
254
- """Extract 8-K sections (items only, no PART divisions)."""
278
+ """Extract 8-K sections using page-by-page approach like standard extractor."""
255
279
  from sec2md.models import Section, Page, ITEM_8K_TITLES
256
280
 
257
- # Concatenate all pages into one doc
258
- full_content = "\n\n".join(p["content"] for p in self.pages)
259
- doc = self._clean_8k_text(full_content)
281
+ sections = []
282
+ current_item = None
283
+ current_item_title = None
284
+ current_pages: List[Dict] = []
260
285
 
261
- if not doc:
262
- self._log("DEBUG: No content after cleaning")
263
- return []
286
+ def flush_section():
287
+ nonlocal sections, current_item, current_item_title, current_pages
288
+ if current_pages and current_item:
289
+ # Parse exhibits if this is ITEM 9.01
290
+ exhibits = None
291
+ if current_item.startswith("ITEM 9.01"):
292
+ content = "\n".join(p["content"] for p in current_pages)
293
+ md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', content, re.IGNORECASE | re.MULTILINE)
294
+ ex_block = content[md.end():].strip() if md else content
295
+ parsed_exhibits = self._parse_exhibits(ex_block)
296
+ exhibits = parsed_exhibits if parsed_exhibits else None
297
+
298
+ # Convert page dicts to Page objects
299
+ page_objects = [Page(number=p["page"], content=p["content"], elements=None, text_blocks=None)
300
+ for p in current_pages]
301
+
302
+ sections.append(Section(
303
+ part=None,
304
+ item=current_item,
305
+ item_title=current_item_title,
306
+ pages=page_objects,
307
+ exhibits=exhibits
308
+ ))
309
+ current_pages = []
264
310
 
265
- # Find all item headers
266
- headers: List[Dict] = []
267
- for m in self._ITEM_8K_RE.finditer(doc):
268
- code = self._normalize_8k_item_code(m.group(2))
269
- title_inline = (m.group(3) or "").strip()
270
- # Clean markdown artifacts from title
271
- title_inline = MD_EDGE.sub("", title_inline)
311
+ for page_dict in self.pages:
312
+ page_num = page_dict["page"]
313
+ content = page_dict["content"]
272
314
 
273
- # Skip TOC entries (they have page numbers like "| 3 |" in the title)
274
- if re.search(r'\|\s*\d+\s*\|', title_inline):
275
- self._log(f"DEBUG: Skipping TOC entry for ITEM {code}")
315
+ # Skip boilerplate pages
316
+ if self._is_8k_boilerplate_page(content, page_num):
317
+ self._log(f"DEBUG: Page {page_num} is boilerplate, skipping")
276
318
  continue
277
319
 
278
- title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
279
- headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
280
- self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")
320
+ # Find first valid ITEM header on this page (if any)
321
+ item_m = None
322
+ first_idx = None
281
323
 
282
- if not headers:
283
- self._log("DEBUG: No item headers found")
284
- return []
324
+ for m in self._ITEM_8K_RE.finditer(content):
325
+ # Get the full line for this match
326
+ line_start = content.rfind('\n', 0, m.start()) + 1
327
+ line_end = content.find('\n', m.end())
328
+ if line_end == -1:
329
+ line_end = len(content)
330
+ full_line = content[line_start:line_end].strip()
285
331
 
286
- self._log(f"DEBUG: Total headers found: {len(headers)}")
332
+ # Skip if this is a table row (contains pipe characters)
333
+ if '|' in full_line:
334
+ self._log(f"DEBUG: Page {page_num} skipping table row: {full_line[:60]}")
335
+ continue
287
336
 
288
- # Extract sections
289
- results: List[Section] = []
290
- for i, h in enumerate(headers):
291
- code = h["no"]
292
- next_start = headers[i + 1]["start"] if i + 1 < len(headers) else len(doc)
293
- body = self._slice_8k_body(doc, h["end"], next_start)
337
+ # Get item code and title
338
+ code = self._normalize_8k_item_code(m.group(2))
339
+ title_inline = (m.group(3) or "").strip()
340
+ title_inline = MD_EDGE.sub("", title_inline)
341
+
342
+ # This is a valid ITEM header
343
+ item_m = m
344
+ first_idx = m.start()
345
+ self._log(f"DEBUG: Page {page_num} found ITEM {code} at position {first_idx}")
346
+ break
347
+
348
+ # No item header found - add to current section
349
+ if first_idx is None:
350
+ if current_item:
351
+ current_pages.append({"page": page_num, "content": content.strip()})
352
+ continue
353
+
354
+ # Found item header - split page
355
+ before = content[:first_idx].strip()
356
+ after = content[first_idx:].strip()
357
+
358
+ # Add "before" content to current section
359
+ if current_item and before:
360
+ current_pages.append({"page": page_num, "content": before})
361
+
362
+ # Flush current section
363
+ flush_section()
364
+
365
+ # Start new section
366
+ code = self._normalize_8k_item_code(item_m.group(2))
367
+ title_inline = (item_m.group(3) or "").strip()
368
+ title_inline = MD_EDGE.sub("", title_inline)
369
+ current_item = f"ITEM {code}"
370
+ current_item_title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
294
371
 
295
372
  # Filter by desired_items if provided
296
373
  if self.desired_items and code not in self.desired_items:
297
374
  self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
375
+ current_item = None
376
+ current_item_title = None
298
377
  continue
299
378
 
300
- # For 9.01, parse exhibits
301
- exhibits = []
302
- if code.startswith("9.01"):
303
- md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
304
- ex_block = body[md.end():].strip() if md else body
305
- exhibits = self._parse_exhibits(ex_block)
306
- self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")
307
-
308
- # Map back to Page objects (approximate page boundaries from original content)
309
- # Since 8-K sections can span pages, we need to find which pages contain this content
310
- section_pages = self._map_8k_content_to_pages(body)
311
-
312
- # Skip sections with no matching pages
313
- if not section_pages:
314
- self._log(f"DEBUG: Skipping ITEM {code} (no pages found)")
315
- continue
316
-
317
- # Create Section with exhibits (now part of the model)
318
- section = Section(
319
- part=None, # 8-K has no PART divisions
320
- item=f"ITEM {code}",
321
- item_title=h["title"],
322
- pages=section_pages,
323
- exhibits=exhibits if exhibits else None
324
- )
379
+ # Add "after" content to new section
380
+ if after:
381
+ current_pages.append({"page": page_num, "content": after})
325
382
 
326
- results.append(section)
327
- self._log(f"DEBUG: Extracted ITEM {code} with {len(section_pages)} pages")
383
+ # Flush final section
384
+ flush_section()
328
385
 
329
- self._log(f"DEBUG: Total sections extracted: {len(results)}")
330
- return results
386
+ self._log(f"DEBUG: Total sections extracted: {len(sections)}")
387
+ return sections
331
388
 
332
389
  def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
333
390
  """Map extracted section content back to Page objects, splitting at section boundaries."""
@@ -337,7 +394,10 @@ class SectionExtractor:
337
394
  section_content_cleaned = self._clean_8k_text(section_content)
338
395
  remaining_section = section_content_cleaned
339
396
 
340
- for page_dict in self.pages:
397
+ # Use filtered pages (excludes cover, TOC, signatures)
398
+ pages_to_search = getattr(self, '_filtered_8k_pages', self.pages)
399
+
400
+ for page_dict in pages_to_search:
341
401
  page_num = page_dict["page"]
342
402
  page_content = page_dict["content"]
343
403
  page_content_cleaned = self._clean_8k_text(page_content)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sec2md
3
- Version: 0.1.10
3
+ Version: 0.1.12
4
4
  Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
5
5
  Author-email: Lucas Astorian <lucas@intellifin.ai>
6
6
  License: MIT
@@ -1,10 +1,10 @@
1
- sec2md/__init__.py,sha256=WHduz6dNVQ_pLZ-OMs-9ikWD8Brc0HdHh1sfo_ygQYU,988
1
+ sec2md/__init__.py,sha256=cKVj4J_IPlcrZASlumEpjv69dMjIveatYUtPjASm1nE,988
2
2
  sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
3
3
  sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
4
4
  sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
5
5
  sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
6
6
  sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
7
- sec2md/section_extractor.py,sha256=otx4RObfNqP1zStilis9z4gDXp4mkN-9-tzIMACEIaE,28050
7
+ sec2md/section_extractor.py,sha256=0MqS_xluIQcI10u8-q7pk3v0uG8p8htlb4Sv0agh3Xg,30663
8
8
  sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
9
9
  sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
10
10
  sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
@@ -12,8 +12,8 @@ sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
13
13
  sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
14
14
  sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
15
- sec2md-0.1.10.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
16
- sec2md-0.1.10.dist-info/METADATA,sha256=xW9Jin_IALBKHTlFzHnY9inkHmKLmf9jCio5jYc-EnY,7626
17
- sec2md-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
- sec2md-0.1.10.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
19
- sec2md-0.1.10.dist-info/RECORD,,
15
+ sec2md-0.1.12.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
16
+ sec2md-0.1.12.dist-info/METADATA,sha256=eSwrrLVm2fNKlpEIBKY-wm4VwKwwh7i-egy3FIfURqA,7626
17
+ sec2md-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
+ sec2md-0.1.12.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
19
+ sec2md-0.1.12.dist-info/RECORD,,