sec2md 0.1.5-py3-none-any.whl → 0.1.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sec2md has been flagged as possibly problematic.

sec2md/__init__.py CHANGED
@@ -5,12 +5,12 @@ from sec2md.utils import flatten_note
 from sec2md.sections import extract_sections, get_section
 from sec2md.chunking import chunk_pages, chunk_section, merge_text_blocks, chunk_text_block
 from sec2md.models import Page, Section, Item10K, Item10Q, Item8K, FilingType, Element, TextBlock, Exhibit
-from sec2md.chunker.markdown_chunk import MarkdownChunk
-from sec2md.chunker.markdown_chunker import MarkdownChunker
+from sec2md.chunker.chunk import Chunk
+from sec2md.chunker.chunker import Chunker
 from sec2md.parser import Parser
 from sec2md.section_extractor import SectionExtractor
 
-__version__ = "0.1.5"
+__version__ = "0.1.12"
 __all__ = [
     "convert_to_markdown",
     "flatten_note",
@@ -29,8 +29,8 @@ __all__ = [
     "Item10Q",
     "Item8K",
     "FilingType",
-    "MarkdownChunk",
-    "MarkdownChunker",
+    "Chunk",
+    "Chunker",
     "Parser",
     "SectionExtractor",
 ]
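For downstream code, the rename is mechanical. A minimal migration sketch (assuming the names were imported from the package root, which the `__all__` list above exports; `pages` stands in for a list of parsed Page objects):

    # sec2md 0.1.5
    from sec2md import MarkdownChunk, MarkdownChunker
    chunker = MarkdownChunker(chunk_size=512, chunk_overlap=128)

    # sec2md 0.1.12: same behavior, new names
    from sec2md import Chunk, Chunker
    chunker = Chunker(chunk_size=512, chunk_overlap=128)
    chunks = chunker.split(pages=pages, header="Company: AAPL | Form: 10-K")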
sec2md/chunker/markdown_blocks.py → sec2md/chunker/blocks.py RENAMED
@@ -73,40 +73,14 @@ class TextBlock(BaseBlock):
         return cls(content=content, page=page, block_type='Text')
 
 
-class AudioParagraphBlock(BaseBlock):
-    block_type: str = Field(default="Text", description="Audio paragraph block type")
-    paragraph_id: int = Field(..., description="Paragraph ID")
-    audio_start: float = Field(..., description="Audio start time")
-    audio_end: float = Field(..., description="Audio end time")
-
-    @computed_field
-    @property
-    def sentences(self) -> List[Sentence]:
-        """Returns the text block sentences"""
-        return [Sentence(content=content) for content in split_sentences(self.content)]
-
-    def format(self) -> dict:
-        """Formats the audio paragraphs"""
-        return {"id": self.paragraph_id, "content": self.content, "start": self.audio_start, "end": self.audio_end}
-
-
-class TableBlock(BaseModel):
+class TableBlock(BaseBlock):
     block_type: str = Field(default='Table', description="Table block type")
-    content: str = Field(..., description="Table content")
-    page: int = Field(..., description="Page number")
-
-    model_config = {"frozen": False}
 
     def __init__(self, **data):
         if 'content' in data:
             data['content'] = self._to_minified_markdown_static(data['content'])
         super().__init__(**data)
 
-    @computed_field
-    @property
-    def tokens(self) -> int:
-        return estimate_tokens(self.content)
-
     @staticmethod
     def _to_minified_markdown_static(content: str) -> str:
         """Returns the table in a Minified Markdown format"""
sec2md/chunker/markdown_chunk.py → sec2md/chunker/chunk.py RENAMED
@@ -1,7 +1,7 @@
 from typing import List, Optional, Tuple, TYPE_CHECKING
 from pydantic import BaseModel, Field, computed_field
 
-from sec2md.chunker.markdown_blocks import BaseBlock
+from sec2md.chunker.blocks import BaseBlock
 
 if TYPE_CHECKING:
     from sec2md.models import Element
@@ -9,8 +9,8 @@ else:
     Element = 'Element'  # Forward reference for Pydantic
 
 
-class MarkdownChunk(BaseModel):
-    """Represents a chunk of markdown content that can be embedded"""
+class Chunk(BaseModel):
+    """Represents a chunk of content that can be embedded"""
 
     blocks: List[BaseBlock] = Field(..., description="List of markdown blocks in this chunk")
     header: Optional[str] = Field(None, description="Optional header for embedding context")
@@ -126,7 +126,7 @@ class MarkdownChunk(BaseModel):
 
     def __repr__(self):
         pages_str = f"{self.start_page}-{self.end_page}" if self.start_page != self.end_page else str(self.start_page)
-        return f"MarkdownChunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
+        return f"Chunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
 
     def _repr_markdown_(self):
         """This method is called by IPython to display as Markdown"""
sec2md/chunker/markdown_chunker.py → sec2md/chunker/chunker.py RENAMED
@@ -1,24 +1,24 @@
 import logging
 from typing import Union, Tuple, List, Dict, Any
 
-from sec2md.chunker.markdown_chunk import MarkdownChunk
-from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
+from sec2md.chunker.chunk import Chunk
+from sec2md.chunker.blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
 
-# Rebuild MarkdownChunk after Element is defined
+# Rebuild Chunk after Element is defined
 from sec2md.models import Element
-MarkdownChunk.model_rebuild()
+Chunk.model_rebuild()
 
 logger = logging.getLogger(__name__)
 
 
-class MarkdownChunker:
-    """Splits markdown content into chunks"""
+class Chunker:
+    """Splits content into chunks"""
 
     def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
 
-    def split(self, pages: List[Any], header: str = None) -> List[MarkdownChunk]:
+    def split(self, pages: List[Any], header: str = None) -> List[Chunk]:
         """Split the pages into chunks with optional header for embedding context.
 
         Args:
@@ -26,7 +26,7 @@ class MarkdownChunker:
             header: Optional header to prepend to each chunk's embedding_text
 
         Returns:
-            List of MarkdownChunk objects
+            List of Chunk objects
         """
         # Build element map: page -> List[Element objects]
         page_elements = {}
@@ -64,14 +64,14 @@ class MarkdownChunker:
             last_page = page
 
             for line in page.content.split('\n'):
-                if table_content and not MarkdownChunker._is_table_line(line):
+                if table_content and not Chunker._is_table_line(line):
                     blocks.append(TableBlock(content=table_content, page=page.number))
                     table_content = ""
 
                 if line.startswith("#"):
                     blocks.append(HeaderBlock(content=line, page=page.number))
 
-                elif MarkdownChunker._is_table_line(line):
+                elif Chunker._is_table_line(line):
                     table_content += f"{line}\n"
 
                 else:
@@ -96,7 +96,7 @@ class MarkdownChunker:
                 return True
         return True
 
-    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[MarkdownChunk]:
+    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[Chunk]:
         """Converts the blocks to chunks"""
         page_elements = page_elements or {}
         chunks = []
@@ -127,7 +127,7 @@ class MarkdownChunker:
         return chunks
 
     def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                            chunks: List[MarkdownChunk], header: str = None, page_elements: dict = None):
+                            chunks: List[Chunk], header: str = None, page_elements: dict = None):
         """Process a text block by breaking it into sentences if needed"""
         sentences = []
         sentences_tokens = 0
@@ -156,7 +156,7 @@ class MarkdownChunker:
         return chunk_blocks, num_tokens, chunks
 
     def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                             chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
+                             chunks: List[Chunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
         """Process a table block with optional header backtrack"""
         context = []
         context_tokens = 0
@@ -200,7 +200,7 @@ class MarkdownChunker:
         return chunk_blocks, num_tokens, chunks
 
     def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                                    chunks: List[MarkdownChunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
+                                    chunks: List[Chunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
         """Process a header block"""
         if not chunk_blocks:
             chunk_blocks.append(block)
@@ -223,17 +223,17 @@ class MarkdownChunker:
 
         return chunk_blocks, num_tokens, chunks
 
-    def _finalize_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str, page_elements: dict):
+    def _finalize_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str, page_elements: dict):
         """Create chunk with elements from the pages it spans"""
         chunk_pages = set(block.page for block in blocks)
         elements = []
         for page_num in sorted(chunk_pages):
             if page_num in page_elements:
                 elements.extend(page_elements[page_num])
-        chunks.append(MarkdownChunk(blocks=blocks, header=header, elements=elements))
+        chunks.append(Chunk(blocks=blocks, header=header, elements=elements))
 
-    def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
-        List[MarkdownChunk], List[BaseBlock], int]:
+    def _create_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
+        List[Chunk], List[BaseBlock], int]:
         """Creates a chunk and returns overlap blocks"""
         page_elements = page_elements or {}
         self._finalize_chunk(chunks, blocks, header, page_elements)
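Throughout the chunker, `chunk_size` and `chunk_overlap` are token budgets, and the `tokens` property removed from TableBlock above relied on an `estimate_tokens` helper that this diff does not show. A sketch consistent with the "chars/4" estimate documented in chunking.py below (the helper's exact shape is an assumption):

    def estimate_tokens(text: str) -> int:
        # Rough heuristic per the chunking.py docstrings: ~4 characters per token
        return max(1, len(text) // 4)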
sec2md/chunking.py CHANGED
@@ -1,10 +1,10 @@
-"""Chunking utilities for page-aware markdown splitting."""
+"""Chunking utilities for page-aware splitting."""
 
 from typing import List, Optional
 from collections import defaultdict
 from sec2md.models import Page, Section, TextBlock
-from sec2md.chunker.markdown_chunker import MarkdownChunker
-from sec2md.chunker.markdown_chunk import MarkdownChunk
+from sec2md.chunker.chunker import Chunker
+from sec2md.chunker.chunk import Chunk
 
 
 def chunk_pages(
@@ -12,9 +12,9 @@ def chunk_pages(
     chunk_size: int = 512,
     chunk_overlap: int = 128,
     header: Optional[str] = None
-) -> List[MarkdownChunk]:
+) -> List[Chunk]:
     """
-    Chunk pages into overlapping markdown chunks.
+    Chunk pages into overlapping chunks.
 
     Args:
         pages: List of Page objects (with optional elements)
@@ -23,7 +23,7 @@ def chunk_pages(
         header: Optional header to prepend to each chunk's embedding_text
 
     Returns:
-        List of MarkdownChunk objects with page tracking and elements
+        List of Chunk objects with page tracking and elements
 
     Example:
         >>> pages = sec2md.convert_to_markdown(html, return_pages=True, include_elements=True)
@@ -32,7 +32,7 @@ def chunk_pages(
         ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
         ...     print(f"Elements: {chunk.elements}")
     """
-    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     return chunker.split(pages=pages, header=header)
 
 
@@ -41,9 +41,9 @@ def chunk_section(
     chunk_size: int = 512,
     chunk_overlap: int = 128,
     header: Optional[str] = None
-) -> List[MarkdownChunk]:
+) -> List[Chunk]:
     """
-    Chunk a filing section into overlapping markdown chunks.
+    Chunk a filing section into overlapping chunks.
 
     Args:
         section: Section object from extract_sections()
@@ -52,7 +52,7 @@ def chunk_section(
         header: Optional header to prepend to each chunk's embedding_text
 
     Returns:
-        List of MarkdownChunk objects
+        List of Chunk objects
 
     Example:
         >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
@@ -79,8 +79,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
 
     Returns:
         List of merged TextBlock objects with page metadata:
-        - page_start: First page the note appears on
-        - page_end: Last page the note appears on
+        - start_page: First page the note appears on
+        - end_page: Last page the note appears on
         - source_pages: All pages the note spans
         - elements: All elements from all pages
 
@@ -88,7 +88,7 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
         >>> pages = parser.get_pages(include_elements=True)
         >>> merged = merge_text_blocks(pages)
         >>> for tb in merged:
-        ...     print(f"{tb.title}: pages {tb.page_start}-{tb.page_end}")
+        ...     print(f"{tb.title}: pages {tb.start_page}-{tb.end_page}")
         Debt Disclosure: pages 45-46
         Segment Reporting: pages 49-50
     """
@@ -97,8 +97,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
         "name": None,
         "title": None,
         "elements": [],
-        "page_start": float('inf'),
-        "page_end": -1,
+        "start_page": float('inf'),
+        "end_page": -1,
         "pages": set()
     })
 
@@ -108,8 +108,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
             tb_map[tb.name]["name"] = tb.name
             tb_map[tb.name]["title"] = tb.title
             tb_map[tb.name]["elements"].extend(tb.elements)
-            tb_map[tb.name]["page_start"] = min(tb_map[tb.name]["page_start"], page.number)
-            tb_map[tb.name]["page_end"] = max(tb_map[tb.name]["page_end"], page.number)
+            tb_map[tb.name]["start_page"] = min(tb_map[tb.name]["start_page"], page.number)
+            tb_map[tb.name]["end_page"] = max(tb_map[tb.name]["end_page"], page.number)
             tb_map[tb.name]["pages"].add(page.number)
 
     # Create merged TextBlock objects
@@ -119,8 +119,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
             name=tb_data["name"],
             title=tb_data["title"],
             elements=tb_data["elements"],
-            page_start=tb_data["page_start"],
-            page_end=tb_data["page_end"],
+            start_page=tb_data["start_page"],
+            end_page=tb_data["end_page"],
             source_pages=sorted(tb_data["pages"])
         )
         merged.append(tb)
@@ -132,8 +132,8 @@ def chunk_text_block(
     text_block: TextBlock,
     chunk_size: int = 512,
     chunk_overlap: int = 128,
-    include_title_as_header: bool = True
-) -> List[MarkdownChunk]:
+    header: Optional[str] = None
+) -> List[Chunk]:
     """
     Chunk a single TextBlock (financial note).
 
@@ -141,17 +141,17 @@ def chunk_text_block(
         text_block: TextBlock object (possibly spanning multiple pages)
         chunk_size: Target chunk size in tokens (estimated as chars/4)
         chunk_overlap: Overlap between chunks in tokens
-        include_title_as_header: Prepend note title to chunks for embedding
+        header: Optional header to prepend to each chunk's embedding_text
 
     Returns:
-        List of MarkdownChunk objects with elements preserved
+        List of Chunk objects with elements preserved
 
     Example:
         >>> merged = merge_text_blocks(pages)
        >>> debt_note = [tb for tb in merged if "Debt" in tb.title][0]
-        >>> chunks = chunk_text_block(debt_note, chunk_size=512)
+        >>> chunks = chunk_text_block(debt_note, chunk_size=512, header="Company: AAPL | Note: Debt")
        >>> print(f"Chunked {debt_note.title} into {len(chunks)} chunks")
-        >>> print(f"Note spans pages {debt_note.page_start}-{debt_note.page_end}")
+        >>> print(f"Note spans pages {debt_note.start_page}-{debt_note.end_page}")
     """
     # Group elements by page
     elements_by_page = defaultdict(list)
@@ -172,8 +172,6 @@ def chunk_text_block(
                 elements=elems  # Only elements from this page
             ))
 
-    # Chunk normally across all pages
-    header = f"Note: {text_block.title}" if include_title_as_header and text_block.title else None
-    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
     return chunker.split(pages=pages, header=header)
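Callers of `chunk_text_block` now build the header themselves; the removed lines above show the old flag auto-generated `f"Note: {title}"`. A migration sketch (assuming `debt_note` is a merged TextBlock, as in the docstring example):

    # 0.1.5: title prepended automatically
    chunks = chunk_text_block(debt_note, chunk_size=512, include_title_as_header=True)

    # 0.1.12: pass the equivalent header explicitly
    header = f"Note: {debt_note.title}" if debt_note.title else None
    chunks = chunk_text_block(debt_note, chunk_size=512, header=header)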
sec2md/models.py CHANGED
@@ -248,8 +248,8 @@ class TextBlock(BaseModel):
     elements: List['Element'] = Field(default_factory=list, description="Element objects in this TextBlock")
 
     # Optional: Set by merge_text_blocks() for multi-page notes
-    page_start: Optional[int] = Field(None, description="First page this TextBlock appears on")
-    page_end: Optional[int] = Field(None, description="Last page this TextBlock appears on")
+    start_page: Optional[int] = Field(None, description="First page this TextBlock appears on")
+    end_page: Optional[int] = Field(None, description="Last page this TextBlock appears on")
     source_pages: Optional[List[int]] = Field(None, description="All pages this TextBlock spans")
 
     model_config = {"frozen": False, "arbitrary_types_allowed": True}
@@ -261,7 +261,7 @@ class TextBlock(BaseModel):
         return [e.id for e in self.elements]
 
     def __repr__(self) -> str:
-        pages_info = f", pages={self.page_start}-{self.page_end}" if self.page_start else ""
+        pages_info = f", pages={self.start_page}-{self.end_page}" if self.start_page else ""
         return f"TextBlock(name='{self.name}', title='{self.title}', elements={len(self.elements)}{pages_info})"
 
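The field rename ships without a backward-compatibility alias, so code reading the old attributes raises `AttributeError` on 0.1.12. A sketch, assuming `tb` is a TextBlock returned by merge_text_blocks():

    span = (tb.page_start, tb.page_end)    # 0.1.5
    span = (tb.start_page, tb.end_page)    # 0.1.12
    print(tb.source_pages)                 # unchanged, e.g. [45, 46]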
sec2md/parser.py CHANGED
@@ -710,7 +710,7 @@ class Parser:
 
         return current
 
-    def get_pages(self, include_elements: bool = False) -> List[Page]:
+    def get_pages(self, include_elements: bool = True) -> List[Page]:
         """Get parsed pages as Page objects.
 
         Args:
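Note the default flip: `get_pages()` now returns element metadata unless the caller opts out. A sketch, assuming `parser` is an already-constructed Parser:

    pages = parser.get_pages()                         # 0.1.12: elements included by default
    pages = parser.get_pages(include_elements=False)   # restores the 0.1.5 default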
sec2md/section_extractor.py CHANGED
@@ -143,8 +143,9 @@ class SectionExtractor:
     # ========== 8-K Specific Methods ==========
 
     # 8-K item header regex: ITEM 1.01 / 7.01 / 9.01
+    # Simplified pattern: match ONLY at line start, with strict formatting
     _ITEM_8K_RE = re.compile(
-        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
+        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)$',
         re.IGNORECASE | re.MULTILINE
     )
 
@@ -250,109 +251,206 @@ class SectionExtractor:
         end = mstop.start() if mstop else next_item_start
         return doc[start_after:end].strip()
 
+    def _is_8k_boilerplate_page(self, page_content: str, page_num: int) -> bool:
+        """Detect cover, TOC, and signature pages in 8-Ks."""
+        # Cover page is always page 1
+        if page_num == 1:
+            return True
+
+        # TOC page: has "TABLE OF CONTENTS" header (with or without bold markdown)
+        # Also detect if page has multiple ITEM entries with page numbers (TOC table pattern)
+        if re.search(r'TABLE OF CONTENTS', page_content, re.IGNORECASE):
+            return True
+
+        # Alternative TOC detection: page has multiple items with "| digit |" pattern (page numbers in table)
+        item_with_page_count = len(re.findall(r'ITEM\s+[1-9]\.\d{2}.*?\|\s*\d+\s*\|', page_content, re.IGNORECASE))
+        if item_with_page_count >= 2:  # If 2+ items have page numbers, it's a TOC
+            return True
+
+        # Signatures page: has "SIGNATURES" header and filing signature text
+        if re.search(r'\*\*SIGNATURES\*\*', page_content) and \
+           re.search(r'Pursuant to the requirements', page_content, re.IGNORECASE):
+            return True
+
+        return False
+
     def _get_8k_sections(self) -> List[Any]:
-        """Extract 8-K sections (items only, no PART divisions)."""
+        """Extract 8-K sections using page-by-page approach like standard extractor."""
         from sec2md.models import Section, Page, ITEM_8K_TITLES
 
-        # Concatenate all pages into one doc
-        full_content = "\n\n".join(p["content"] for p in self.pages)
-        doc = self._clean_8k_text(full_content)
+        sections = []
+        current_item = None
+        current_item_title = None
+        current_pages: List[Dict] = []
 
-        if not doc:
-            self._log("DEBUG: No content after cleaning")
-            return []
+        def flush_section():
+            nonlocal sections, current_item, current_item_title, current_pages
+            if current_pages and current_item:
+                # Parse exhibits if this is ITEM 9.01
+                exhibits = None
+                if current_item.startswith("ITEM 9.01"):
+                    content = "\n".join(p["content"] for p in current_pages)
+                    md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', content, re.IGNORECASE | re.MULTILINE)
+                    ex_block = content[md.end():].strip() if md else content
+                    parsed_exhibits = self._parse_exhibits(ex_block)
+                    exhibits = parsed_exhibits if parsed_exhibits else None
+
+                # Convert page dicts to Page objects
+                page_objects = [Page(number=p["page"], content=p["content"], elements=None, text_blocks=None)
+                                for p in current_pages]
+
+                sections.append(Section(
+                    part=None,
+                    item=current_item,
+                    item_title=current_item_title,
+                    pages=page_objects,
+                    exhibits=exhibits
+                ))
+            current_pages = []
 
-        # Find all item headers
-        headers: List[Dict] = []
-        for m in self._ITEM_8K_RE.finditer(doc):
-            code = self._normalize_8k_item_code(m.group(2))
-            title_inline = (m.group(3) or "").strip()
-            # Clean markdown artifacts from title
-            title_inline = MD_EDGE.sub("", title_inline)
-            title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
-            headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
-            self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")
+        for page_dict in self.pages:
+            page_num = page_dict["page"]
+            content = page_dict["content"]
 
-        if not headers:
-            self._log("DEBUG: No item headers found")
-            return []
+            # Skip boilerplate pages
+            if self._is_8k_boilerplate_page(content, page_num):
+                self._log(f"DEBUG: Page {page_num} is boilerplate, skipping")
+                continue
 
-        self._log(f"DEBUG: Total headers found: {len(headers)}")
+            # Find first valid ITEM header on this page (if any)
+            item_m = None
+            first_idx = None
 
-        # Extract sections
-        results: List[Section] = []
-        for i, h in enumerate(headers):
-            code = h["no"]
-            next_start = headers[i + 1]["start"] if i + 1 < len(headers) else len(doc)
-            body = self._slice_8k_body(doc, h["end"], next_start)
+            for m in self._ITEM_8K_RE.finditer(content):
+                # Get the full line for this match
+                line_start = content.rfind('\n', 0, m.start()) + 1
+                line_end = content.find('\n', m.end())
+                if line_end == -1:
+                    line_end = len(content)
+                full_line = content[line_start:line_end].strip()
+
+                # Skip if this is a table row (contains pipe characters)
+                if '|' in full_line:
+                    self._log(f"DEBUG: Page {page_num} skipping table row: {full_line[:60]}")
+                    continue
+
+                # Get item code and title
+                code = self._normalize_8k_item_code(m.group(2))
+                title_inline = (m.group(3) or "").strip()
+                title_inline = MD_EDGE.sub("", title_inline)
+
+                # This is a valid ITEM header
+                item_m = m
+                first_idx = m.start()
+                self._log(f"DEBUG: Page {page_num} found ITEM {code} at position {first_idx}")
+                break
+
+            # No item header found - add to current section
+            if first_idx is None:
+                if current_item:
+                    current_pages.append({"page": page_num, "content": content.strip()})
+                continue
+
+            # Found item header - split page
+            before = content[:first_idx].strip()
+            after = content[first_idx:].strip()
+
+            # Add "before" content to current section
+            if current_item and before:
+                current_pages.append({"page": page_num, "content": before})
+
+            # Flush current section
+            flush_section()
+
+            # Start new section
+            code = self._normalize_8k_item_code(item_m.group(2))
+            title_inline = (item_m.group(3) or "").strip()
+            title_inline = MD_EDGE.sub("", title_inline)
+            current_item = f"ITEM {code}"
+            current_item_title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
 
             # Filter by desired_items if provided
             if self.desired_items and code not in self.desired_items:
                 self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
+                current_item = None
+                current_item_title = None
                 continue
 
-            # For 9.01, parse exhibits
-            exhibits = []
-            if code.startswith("9.01"):
-                md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
-                ex_block = body[md.end():].strip() if md else body
-                exhibits = self._parse_exhibits(ex_block)
-                self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")
-
-            # Map back to Page objects (approximate page boundaries from original content)
-            # Since 8-K sections can span pages, we need to find which pages contain this content
-            section_pages = self._map_8k_content_to_pages(body)
-
-            # Create Section with exhibits (now part of the model)
-            section = Section(
-                part=None,  # 8-K has no PART divisions
-                item=f"ITEM {code}",
-                item_title=h["title"],
-                pages=section_pages,
-                exhibits=exhibits if exhibits else None
-            )
+            # Add "after" content to new section
+            if after:
+                current_pages.append({"page": page_num, "content": after})
 
-            results.append(section)
-            self._log(f"DEBUG: Extracted ITEM {code} with {len(section_pages)} pages")
+        # Flush final section
+        flush_section()
 
-        self._log(f"DEBUG: Total sections extracted: {len(results)}")
-        return results
+        self._log(f"DEBUG: Total sections extracted: {len(sections)}")
+        return sections
 
     def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
-        """Map extracted section content back to Page objects."""
+        """Map extracted section content back to Page objects, splitting at section boundaries."""
         from sec2md.models import Page
 
-        # Try to find which original pages contain this content
-        # This is heuristic-based: match by content overlap
         matched_pages = []
-        section_preview = section_content[:500]  # Use first 500 chars for matching
+        section_content_cleaned = self._clean_8k_text(section_content)
+        remaining_section = section_content_cleaned
 
-        for page_dict in self.pages:
+        # Use filtered pages (excludes cover, TOC, signatures)
+        pages_to_search = getattr(self, '_filtered_8k_pages', self.pages)
+
+        for page_dict in pages_to_search:
             page_num = page_dict["page"]
-            page_content = self._clean_8k_text(page_dict["content"])
+            page_content = page_dict["content"]
+            page_content_cleaned = self._clean_8k_text(page_content)
 
-            # Check if this page contains part of the section
-            if section_preview in page_content or page_content in section_content:
-                original_page = self._original_pages.get(page_num)
-                matched_pages.append(
-                    Page(
-                        number=page_num,
-                        content=page_content,
-                        elements=original_page.elements if original_page else None,
-                        text_blocks=original_page.text_blocks if original_page else None
-                    )
-                )
+            # Skip pages that don't contain any of the remaining section content
+            if not any(chunk in page_content_cleaned for chunk in remaining_section[:200].split()[:10]):
+                continue
+
+            # Find where the section content appears on this page
+            # Use the original page to preserve formatting/elements
+            original_page = self._original_pages[page_num]
+
+            # For 8-K, we need to split the page content at ITEM boundaries
+            # Find all ITEM headers on this page
+            item_positions = []
+            for m in self._ITEM_8K_RE.finditer(page_content_cleaned):
+                code = self._normalize_8k_item_code(m.group(2))
+                title = (m.group(3) or "").strip()
+                # Skip TOC entries
+                if not re.search(r'\|\s*\d+\s*\|', title):
+                    item_positions.append((m.start(), f"ITEM {code}"))
+
+            # Find which portion of the page belongs to this section
+            section_start_in_page = page_content_cleaned.find(section_content_cleaned[:100])
+
+            if section_start_in_page >= 0:
+                # Find the end: either next ITEM on this page, or end of page
+                section_end_in_page = len(page_content_cleaned)
+                for pos, item_code in item_positions:
+                    # Find the next ITEM after our section starts
+                    if pos > section_start_in_page + 50:  # Give 50 chars buffer
+                        section_end_in_page = pos
+                        break
+
+                # Extract just this section's content from the page
+                page_section_content = page_content_cleaned[section_start_in_page:section_end_in_page].strip()
 
-        # If no matches found (shouldn't happen), create a synthetic page
-        if not matched_pages:
-            matched_pages.append(
-                Page(
-                    number=1,
-                    content=section_content,
-                    elements=None,
+                # Create a new Page with only this section's content
+                # Note: This loses elements, but keeps the section boundary clean
+                matched_pages.append(Page(
+                    number=page_num,
+                    content=page_section_content,
+                    elements=None,  # TODO: Could filter elements by content matching
                     text_blocks=None
-                )
-            )
+                ))
+
+                # Update remaining section content to find on next pages
+                # Remove what we've matched from the section
+                matched_len = len(page_section_content)
+                remaining_section = remaining_section[matched_len:] if matched_len < len(remaining_section) else ""
+
+                if not remaining_section.strip():
+                    break  # Found all content for this section
 
         return matched_pages
 
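Taken together, the new 8-K path walks pages in order, drops cover/TOC/signature boilerplate, and splits pages at item headers instead of regex-slicing one concatenated document. A hedged end-to-end sketch using the public helpers (the `filing_type="8-K"` value is an assumption; the docstrings above only show "10-K"):

    import sec2md

    pages = sec2md.convert_to_markdown(html, return_pages=True)
    sections = sec2md.extract_sections(pages, filing_type="8-K")
    for section in sections:
        print(section.item, section.item_title)  # e.g. "ITEM 9.01", "Financial Statements and Exhibits"
        if section.exhibits:                     # populated only for ITEM 9.01 in the code above
            print(section.exhibits)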
sec2md-0.1.5.dist-info/METADATA → sec2md-0.1.12.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sec2md
-Version: 0.1.5
+Version: 0.1.12
 Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
 Author-email: Lucas Astorian <lucas@intellifin.ai>
 License: MIT
sec2md-0.1.12.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+sec2md/__init__.py,sha256=cKVj4J_IPlcrZASlumEpjv69dMjIveatYUtPjASm1nE,988
+sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
+sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
+sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
+sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
+sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
+sec2md/section_extractor.py,sha256=0MqS_xluIQcI10u8-q7pk3v0uG8p8htlb4Sv0agh3Xg,30663
+sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
+sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
+sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
+sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
+sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
+sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
+sec2md-0.1.12.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
+sec2md-0.1.12.dist-info/METADATA,sha256=eSwrrLVm2fNKlpEIBKY-wm4VwKwwh7i-egy3FIfURqA,7626
+sec2md-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sec2md-0.1.12.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
+sec2md-0.1.12.dist-info/RECORD,,
sec2md-0.1.5.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
-sec2md/__init__.py,sha256=iR_2g-PDkCAzY76uQwBjIVpprvkxlNopdmDduzDp8lg,1037
-sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
-sec2md/chunking.py,sha256=SQASDA057bKLhSj34GNAHrRl94Rf-A9WlfEvhhWPuIc,6350
-sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
-sec2md/models.py,sha256=H_3HnI8exGVnbqbdT1Bf4bNhPLjqvlP64ud0au5ohJk,14735
-sec2md/parser.py,sha256=J1He6XMa1Mf9YGJCEffWuCs7SAqi0Ts6S445CTO-lAA,47559
-sec2md/section_extractor.py,sha256=JTbZpPgmTipzU1Q5LehlQ9y2X4ZcQRTj3A7iMr90iqM,25976
-sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
-sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
-sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
-sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sec2md/chunker/markdown_blocks.py,sha256=yEF_v72DvYOVu0ZQ5bBCFpNM12INg-8RmajIu_dorQQ,4372
-sec2md/chunker/markdown_chunk.py,sha256=hCMpjn0cc5TIjWSZviq4fM7e781X3AtRcmI60pDLWro,4763
-sec2md/chunker/markdown_chunker.py,sha256=IYW8pQ2q9hX1lRGw4TnKAQcr-HmJfSW7wffu-BA0Jms,10743
-sec2md-0.1.5.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
-sec2md-0.1.5.dist-info/METADATA,sha256=YWQ9uiut1LcBQxOCvFcT8MlfgLO7VBCDtEju5h7fp6k,7625
-sec2md-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sec2md-0.1.5.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
-sec2md-0.1.5.dist-info/RECORD,,