PyPI - sec2md - Versions diffs - 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl - Mend

sec2md 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sec2md might be problematic. Click here for more details.

Files changed (7) hide show

sec2md/__init__.py CHANGED Viewed

@@ -10,7 +10,7 @@ from sec2md.chunker.chunker import Chunker
 from sec2md.parser import Parser
 from sec2md.section_extractor import SectionExtractor
-__version__ = "0.1.10"
+__version__ = "0.1.12"
 __all__ = [
     "convert_to_markdown",
     "flatten_note",

sec2md/section_extractor.py CHANGED Viewed

@@ -143,8 +143,9 @@ class SectionExtractor:
     # ========== 8-K Specific Methods ==========
     # 8-K item header regex: ITEM 1.01 / 7.01 / 9.01
+    # Simplified pattern: match ONLY at line start, with strict formatting
     _ITEM_8K_RE = re.compile(
-        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
+        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)$',
         re.IGNORECASE | re.MULTILINE
     )
@@ -250,84 +251,140 @@ class SectionExtractor:
         end = mstop.start() if mstop else next_item_start
         return doc[start_after:end].strip()
+    def _is_8k_boilerplate_page(self, page_content: str, page_num: int) -> bool:
+        """Detect cover, TOC, and signature pages in 8-Ks."""
+        # Cover page is always page 1
+        if page_num == 1:
+            return True
+        # TOC page: has "TABLE OF CONTENTS" header (with or without bold markdown)
+        # Also detect if page has multiple ITEM entries with page numbers (TOC table pattern)
+        if re.search(r'TABLE OF CONTENTS', page_content, re.IGNORECASE):
+            return True
+        # Alternative TOC detection: page has multiple items with "| digit |" pattern (page numbers in table)
+        item_with_page_count = len(re.findall(r'ITEM\s+[1-9]\.\d{2}.*?\|\s*\d+\s*\|', page_content, re.IGNORECASE))
+        if item_with_page_count >= 2:  # If 2+ items have page numbers, it's a TOC
+            return True
+        # Signatures page: has "SIGNATURES" header and filing signature text
+        if re.search(r'\*\*SIGNATURES\*\*', page_content) and \
+           re.search(r'Pursuant to the requirements', page_content, re.IGNORECASE):
+            return True
+        return False
     def _get_8k_sections(self) -> List[Any]:
-        """Extract 8-K sections (items only, no PART divisions)."""
+        """Extract 8-K sections using page-by-page approach like standard extractor."""
         from sec2md.models import Section, Page, ITEM_8K_TITLES
-        # Concatenate all pages into one doc
-        full_content = "\n\n".join(p["content"] for p in self.pages)
-        doc = self._clean_8k_text(full_content)
+        sections = []
+        current_item = None
+        current_item_title = None
+        current_pages: List[Dict] = []
-        if not doc:
-            self._log("DEBUG: No content after cleaning")
-            return []
+        def flush_section():
+            nonlocal sections, current_item, current_item_title, current_pages
+            if current_pages and current_item:
+                # Parse exhibits if this is ITEM 9.01
+                exhibits = None
+                if current_item.startswith("ITEM 9.01"):
+                    content = "\n".join(p["content"] for p in current_pages)
+                    md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', content, re.IGNORECASE | re.MULTILINE)
+                    ex_block = content[md.end():].strip() if md else content
+                    parsed_exhibits = self._parse_exhibits(ex_block)
+                    exhibits = parsed_exhibits if parsed_exhibits else None
+                # Convert page dicts to Page objects
+                page_objects = [Page(number=p["page"], content=p["content"], elements=None, text_blocks=None)
+                                for p in current_pages]
+                sections.append(Section(
+                    part=None,
+                    item=current_item,
+                    item_title=current_item_title,
+                    pages=page_objects,
+                    exhibits=exhibits
+                ))
+                current_pages = []
-        # Find all item headers
-        headers: List[Dict] = []
-        for m in self._ITEM_8K_RE.finditer(doc):
-            code = self._normalize_8k_item_code(m.group(2))
-            title_inline = (m.group(3) or "").strip()
-            # Clean markdown artifacts from title
-            title_inline = MD_EDGE.sub("", title_inline)
+        for page_dict in self.pages:
+            page_num = page_dict["page"]
+            content = page_dict["content"]
-            # Skip TOC entries (they have page numbers like "| 3 |" in the title)
-            if re.search(r'\|\s*\d+\s*\|', title_inline):
-                self._log(f"DEBUG: Skipping TOC entry for ITEM {code}")
+            # Skip boilerplate pages
+            if self._is_8k_boilerplate_page(content, page_num):
+                self._log(f"DEBUG: Page {page_num} is boilerplate, skipping")
                 continue
-            title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
-            headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
-            self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")
+            # Find first valid ITEM header on this page (if any)
+            item_m = None
+            first_idx = None
-        if not headers:
-            self._log("DEBUG: No item headers found")
-            return []
+            for m in self._ITEM_8K_RE.finditer(content):
+                # Get the full line for this match
+                line_start = content.rfind('\n', 0, m.start()) + 1
+                line_end = content.find('\n', m.end())
+                if line_end == -1:
+                    line_end = len(content)
+                full_line = content[line_start:line_end].strip()
-        self._log(f"DEBUG: Total headers found: {len(headers)}")
+                # Skip if this is a table row (contains pipe characters)
+                if '|' in full_line:
+                    self._log(f"DEBUG: Page {page_num} skipping table row: {full_line[:60]}")
+                    continue
-        # Extract sections
-        results: List[Section] = []
-        for i, h in enumerate(headers):
-            code = h["no"]
-            next_start = headers[i + 1]["start"] if i + 1 < len(headers) else len(doc)
-            body = self._slice_8k_body(doc, h["end"], next_start)
+                # Get item code and title
+                code = self._normalize_8k_item_code(m.group(2))
+                title_inline = (m.group(3) or "").strip()
+                title_inline = MD_EDGE.sub("", title_inline)
+                # This is a valid ITEM header
+                item_m = m
+                first_idx = m.start()
+                self._log(f"DEBUG: Page {page_num} found ITEM {code} at position {first_idx}")
+                break
+            # No item header found - add to current section
+            if first_idx is None:
+                if current_item:
+                    current_pages.append({"page": page_num, "content": content.strip()})
+                continue
+            # Found item header - split page
+            before = content[:first_idx].strip()
+            after = content[first_idx:].strip()
+            # Add "before" content to current section
+            if current_item and before:
+                current_pages.append({"page": page_num, "content": before})
+            # Flush current section
+            flush_section()
+            # Start new section
+            code = self._normalize_8k_item_code(item_m.group(2))
+            title_inline = (item_m.group(3) or "").strip()
+            title_inline = MD_EDGE.sub("", title_inline)
+            current_item = f"ITEM {code}"
+            current_item_title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
             # Filter by desired_items if provided
             if self.desired_items and code not in self.desired_items:
                 self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
+                current_item = None
+                current_item_title = None
                 continue
-            # For 9.01, parse exhibits
-            exhibits = []
-            if code.startswith("9.01"):
-                md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
-                ex_block = body[md.end():].strip() if md else body
-                exhibits = self._parse_exhibits(ex_block)
-                self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")
-            # Map back to Page objects (approximate page boundaries from original content)
-            # Since 8-K sections can span pages, we need to find which pages contain this content
-            section_pages = self._map_8k_content_to_pages(body)
-            # Skip sections with no matching pages
-            if not section_pages:
-                self._log(f"DEBUG: Skipping ITEM {code} (no pages found)")
-                continue
-            # Create Section with exhibits (now part of the model)
-            section = Section(
-                part=None,  # 8-K has no PART divisions
-                item=f"ITEM {code}",
-                item_title=h["title"],
-                pages=section_pages,
-                exhibits=exhibits if exhibits else None
-            )
+            # Add "after" content to new section
+            if after:
+                current_pages.append({"page": page_num, "content": after})
-            results.append(section)
-            self._log(f"DEBUG: Extracted ITEM {code} with {len(section_pages)} pages")
+        # Flush final section
+        flush_section()
-        self._log(f"DEBUG: Total sections extracted: {len(results)}")
-        return results
+        self._log(f"DEBUG: Total sections extracted: {len(sections)}")
+        return sections
     def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
         """Map extracted section content back to Page objects, splitting at section boundaries."""
@@ -337,7 +394,10 @@ class SectionExtractor:
         section_content_cleaned = self._clean_8k_text(section_content)
         remaining_section = section_content_cleaned
-        for page_dict in self.pages:
+        # Use filtered pages (excludes cover, TOC, signatures)
+        pages_to_search = getattr(self, '_filtered_8k_pages', self.pages)
+        for page_dict in pages_to_search:
             page_num = page_dict["page"]
             page_content = page_dict["content"]
             page_content_cleaned = self._clean_8k_text(page_content)

{sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sec2md
-Version: 0.1.10
+Version: 0.1.12
 Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
 Author-email: Lucas Astorian <lucas@intellifin.ai>
 License: MIT

{sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/RECORD RENAMED Viewed

@@ -1,10 +1,10 @@
-sec2md/__init__.py,sha256=WHduz6dNVQ_pLZ-OMs-9ikWD8Brc0HdHh1sfo_ygQYU,988
+sec2md/__init__.py,sha256=cKVj4J_IPlcrZASlumEpjv69dMjIveatYUtPjASm1nE,988
 sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
 sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
 sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
 sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
 sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
-sec2md/section_extractor.py,sha256=otx4RObfNqP1zStilis9z4gDXp4mkN-9-tzIMACEIaE,28050
+sec2md/section_extractor.py,sha256=0MqS_xluIQcI10u8-q7pk3v0uG8p8htlb4Sv0agh3Xg,30663
 sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
 sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
 sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
@@ -12,8 +12,8 @@ sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
 sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
 sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
-sec2md-0.1.10.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
-sec2md-0.1.10.dist-info/METADATA,sha256=xW9Jin_IALBKHTlFzHnY9inkHmKLmf9jCio5jYc-EnY,7626
-sec2md-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sec2md-0.1.10.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
-sec2md-0.1.10.dist-info/RECORD,,
+sec2md-0.1.12.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
+sec2md-0.1.12.dist-info/METADATA,sha256=eSwrrLVm2fNKlpEIBKY-wm4VwKwwh7i-egy3FIfURqA,7626
+sec2md-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sec2md-0.1.12.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
+sec2md-0.1.12.dist-info/RECORD,,

{sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/WHEEL RENAMED Viewed

File without changes

{sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{sec2md-0.1.10.dist-info → sec2md-0.1.12.dist-info}/top_level.txt RENAMED Viewed

File without changes

sec2md 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

Potentially problematic release.

sec2md 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl