sec2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

@@ -0,0 +1,316 @@
1
from __future__ import annotations

import re
from typing import List, Dict, Optional, Literal

# Optional markdown bold markers ("**" / "__") and/or inline HTML tags that may
# precede a heading in the converted markdown.
LEAD_WRAP = r'(?:\*\*|__)?\s*(?:</?[^>]+>\s*)*'

# "PART I" / "PART IV" style roman-numeral headings at the start of a line.
PART_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(PART\s+[IVXLC]+)\b(?:\s*$|\s+)',
    re.IGNORECASE | re.MULTILINE
)
# "ITEM 1A." style headings: group(1)="ITEM", group(2)=number with optional
# letter suffix, group(3)=any trailing title text after an optional separator.
ITEM_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(ITEM)\s+(\d{{1,2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
    re.IGNORECASE | re.MULTILINE
)

# Running header/footer lines such as "Apple Inc. | 2023 Form 10-K | 12".
HEADER_FOOTER_RE = re.compile(
    r'^\s*(?:[A-Z][A-Za-z0-9 .,&\-]+)?\s*\|\s*\d{4}\s+Form\s+10-[KQ]\s*\|\s*\d+\s*$'
)
# "Page 3" / "Page 3 of 120" lines, or lines containing only a bare number.
PAGE_NUM_RE = re.compile(r'^\s*Page\s+\d+\s*(?:of\s+\d+)?\s*$|^\s*\d+\s*$', re.IGNORECASE)
# Bold markers at the very start or end of a line (stripped during cleaning).
MD_EDGE = re.compile(r'^\s*(?:\*\*|__)\s*|\s*(?:\*\*|__)\s*$')

# Unicode whitespace variants normalized away before parsing.
NBSP, NARROW_NBSP, ZWSP = '\u00A0', '\u202F', '\u200B'

DOT_LEAD_RE = re.compile(r'^.*\.{3,}\s*\d{1,4}\s*$', re.M)  # "... 123"
ITEM_ROWS_RE = re.compile(r'^\s*ITEM\s+\d{1,2}[A-Z]?\.?\b', re.I | re.M)

# Canonical PART -> [ITEM, ...] layout per filing type; used by
# SectionExtractor to validate and infer section placement.
FILING_STRUCTURES = {
    "10-K": {
        "PART I": ["ITEM 1", "ITEM 1A", "ITEM 1B", "ITEM 1C", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 5", "ITEM 6", "ITEM 7", "ITEM 7A", "ITEM 8", "ITEM 9", "ITEM 9A", "ITEM 9B", "ITEM 9C"],
        "PART III": ["ITEM 10", "ITEM 11", "ITEM 12", "ITEM 13", "ITEM 14"],
        "PART IV": ["ITEM 15", "ITEM 16"]
    },
    "10-Q": {
        "PART I": ["ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 1", "ITEM 1A", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6"]
    },
    "20-F": {
        "PART I": [
            "ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6",
            "ITEM 7", "ITEM 8", "ITEM 9", "ITEM 10", "ITEM 11", "ITEM 12", "ITEM 12D"
        ],
        "PART II": [
            "ITEM 13", "ITEM 14", "ITEM 15",
            # include all 16X variants explicitly so validation stays strict
            "ITEM 16", "ITEM 16A", "ITEM 16B", "ITEM 16C", "ITEM 16D", "ITEM 16E", "ITEM 16F", "ITEM 16G", "ITEM 16H",
            "ITEM 16I"
        ],
        "PART III": ["ITEM 17", "ITEM 18", "ITEM 19"]
    }
}
53
+
54
+
55
class SectionExtractor:
    """Splits SEC filing pages (markdown text) into PART/ITEM sections."""

    def __init__(self, pages: List[Dict], filing_type: Optional[Literal["10-K", "10-Q", "20-F"]] = None, debug: bool = False):
        """
        Args:
            pages: List of {"page": int, "content": str} dicts in page order.
            filing_type: When given, extracted sections are validated against
                the canonical layout in FILING_STRUCTURES.
            debug: Print DEBUG trace messages while extracting.
        """
        self.pages = pages
        self.filing_type = filing_type
        # Canonical PART -> [ITEM, ...] layout, or None to skip validation.
        self.structure = FILING_STRUCTURES.get(filing_type) if filing_type else None
        self.debug = debug

        # Checked by _is_toc but never set True in this class — looks like a
        # vestigial hook; TODO confirm intent.
        self._toc_locked = False
63
+
64
+ def _log(self, msg: str):
65
+ if self.debug:
66
+ print(msg)
67
+
68
+ @staticmethod
69
+ def _normalize_section_key(part: Optional[str], item_num: Optional[str]) -> tuple[Optional[str], Optional[str]]:
70
+ part_key = re.sub(r'\s+', ' ', part.upper().strip()) if part else None
71
+ item_key = f"ITEM {item_num.upper()}" if item_num else None
72
+ return part_key, item_key
73
+
74
+ @staticmethod
75
+ def _normalize_section(text: str) -> str:
76
+ return re.sub(r'\s+', ' ', text.upper().strip())
77
+
78
+ def _clean_lines(self, content: str) -> List[str]:
79
+ content = content.replace(NBSP, ' ').replace(NARROW_NBSP, ' ').replace(ZWSP, '')
80
+ lines = [ln.rstrip() for ln in content.split('\n')]
81
+ out = []
82
+ for ln in lines:
83
+ if HEADER_FOOTER_RE.match(ln) or PAGE_NUM_RE.match(ln):
84
+ continue
85
+ ln = MD_EDGE.sub('', ln)
86
+ out.append(ln)
87
+ return out
88
+
89
+ def _infer_part_for_item(self, filing_type: str, item_key: str) -> Optional[str]:
90
+ m = re.match(r'ITEM\s+(\d{1,2})', item_key)
91
+ if not m:
92
+ return None
93
+ num = int(m.group(1))
94
+ if filing_type == "10-K":
95
+ if 1 <= num <= 4:
96
+ return "PART I"
97
+ elif 5 <= num <= 9:
98
+ return "PART II"
99
+ elif 10 <= num <= 14:
100
+ return "PART III"
101
+ elif 15 <= num <= 16:
102
+ return "PART IV"
103
+ elif filing_type == "10-Q":
104
+ if 1 <= num <= 4:
105
+ return "PART I"
106
+ else:
107
+ return "PART II"
108
+ return None
109
+
110
+ @staticmethod
111
+ def _clean_item_title(title: str) -> str:
112
+ title = re.sub(r'^\s*[:.\-–—]\s*', '', title)
113
+ title = re.sub(r'\s+', ' ', title).strip()
114
+ return title
115
+
116
    def _is_toc(self, content: str, page_num: int = 1) -> bool:
        """Heuristically decide whether *content* is a table-of-contents page.

        Only pages 1-5 are ever considered; beyond that the answer is
        always False.
        """
        # Simple rule: within first 5 pages, if we see multiple matches, treat as TOC.
        # “Multiple” = ≥3 ITEM rows OR ≥3 dotted-leader lines.
        # NOTE(review): _toc_locked is initialized False in __init__ and never
        # set True anywhere in this class — confirm whether the guard is still needed.
        if self._toc_locked or page_num > 5:
            return False

        item_hits = len(ITEM_ROWS_RE.findall(content))
        leader_hits = len(DOT_LEAD_RE.findall(content))

        return (item_hits >= 3) or (leader_hits >= 3)
126
    def get_sections(self) -> List[Dict]:
        """Scan the filing pages and split them into PART/ITEM sections.

        Returns:
            A list of dicts with keys "part", "item", "item_title",
            "page_start", and "pages" (each page is {"page": int,
            "content": str}). TOC pages are skipped, tiny PART stubs are
            dropped, and when a filing_type was supplied the result is
            validated against FILING_STRUCTURES.
        """
        sections = []
        current_part = None
        current_item = None
        current_item_title = None
        current_pages: List[Dict] = []

        def flush_section():
            # Close out the section accumulated so far (no-op when empty).
            nonlocal sections, current_part, current_item, current_item_title, current_pages
            if current_pages:
                sections.append({
                    "part": current_part,
                    "item": current_item,
                    "item_title": current_item_title,
                    "page_start": current_pages[0]["page"],
                    "pages": current_pages
                })
                current_pages = []

        for page_dict in self.pages:
            page_num = page_dict["page"]
            content = page_dict["content"]

            # Skip table-of-contents pages near the front of the document.
            if self._is_toc(content, page_num):
                self._log(f"DEBUG: Page {page_num} detected as TOC, skipping")
                continue

            lines = self._clean_lines(content)
            joined = "\n".join(lines)

            if not joined.strip():
                self._log(f"DEBUG: Page {page_num} is empty after cleaning")
                continue

            # Locate the earliest PART or ITEM heading on this page (if any).
            part_m = None
            item_m = None
            first_idx = None
            first_kind = None

            for m in PART_PATTERN.finditer(joined):
                part_m = m
                first_idx = m.start()
                first_kind = 'part'
                self._log(f"DEBUG: Page {page_num} found PART at position {first_idx}: {m.group(1)}")
                break

            for m in ITEM_PATTERN.finditer(joined):
                # The first ITEM heading wins only if it precedes the PART heading.
                if first_idx is None or m.start() < first_idx:
                    item_m = m
                    first_idx = m.start()
                    first_kind = 'item'
                    self._log(f"DEBUG: Page {page_num} found ITEM at position {first_idx}: ITEM {m.group(2)}")
                break

            if first_kind is None:
                # No heading: the whole page continues the current section (if any).
                self._log(f"DEBUG: Page {page_num} - no header found. In section: {current_part or current_item}")
                if current_part or current_item:
                    if joined.strip():
                        current_pages.append({"page": page_num, "content": joined})
                continue

            before = joined[:first_idx].strip()
            after = joined[first_idx:].strip()

            # Text before the heading still belongs to the previous section.
            if (current_part or current_item) and before:
                current_pages.append({"page": page_num, "content": before})

            flush_section()

            if first_kind == 'part' and part_m:
                part_text = part_m.group(1)
                current_part, _ = self._normalize_section_key(part_text, None)
                current_item = None
                current_item_title = None
            elif first_kind == 'item' and item_m:
                item_num = item_m.group(2)
                title = (item_m.group(3) or "").strip()
                current_item_title = self._clean_item_title(title) if title else None
                if current_part is None and self.filing_type:
                    # ITEM seen before any PART heading: infer the PART from the number.
                    inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                    if inferred:
                        current_part = inferred
                        self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                _, current_item = self._normalize_section_key(current_part, item_num)

            if after:
                current_pages.append({"page": page_num, "content": after})

                if first_kind == 'part' and part_m:
                    # A PART heading is often immediately followed by its first ITEM
                    # on the same page; promote the section to that ITEM.
                    item_after = None
                    for m in ITEM_PATTERN.finditer(after):
                        item_after = m
                        break
                    if item_after:
                        start = item_after.start()
                        current_pages[-1]["content"] = after[start:]
                        item_num = item_after.group(2)
                        title = (item_after.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - promoted PART to ITEM {item_num} (intra-page)")
                        after = current_pages[-1]["content"]

                # Handle further PART/ITEM transitions lower on the same page.
                tail = after
                while True:
                    next_kind, next_idx, next_part_m, next_item_m = None, None, None, None

                    for m in PART_PATTERN.finditer(tail):
                        if m.start() > 0:
                            next_kind, next_idx, next_part_m = 'part', m.start(), m
                            break
                    for m in ITEM_PATTERN.finditer(tail):
                        if m.start() > 0 and (next_idx is None or m.start() < next_idx):
                            next_kind, next_idx, next_item_m = 'item', m.start(), m
                            break

                    if next_idx is None:
                        break

                    before_seg = tail[:next_idx].strip()
                    after_seg = tail[next_idx:].strip()

                    if before_seg:
                        current_pages[-1]["content"] = before_seg
                    flush_section()

                    if next_kind == 'part' and next_part_m:
                        current_part, _ = self._normalize_section_key(next_part_m.group(1), None)
                        current_item = None
                        current_item_title = None
                        self._log(f"DEBUG: Page {page_num} - intra-page PART transition to {current_part}")
                    elif next_kind == 'item' and next_item_m:
                        item_num = next_item_m.group(2)
                        title = (next_item_m.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        if current_part is None and self.filing_type:
                            inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                            if inferred:
                                current_part = inferred
                                self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - intra-page ITEM transition to {current_item}")

                    current_pages.append({"page": page_num, "content": after_seg})
                    tail = after_seg

        flush_section()

        self._log(f"DEBUG: Total sections before validation: {len(sections)}")
        for s in sections:
            self._log(f" - Part: {s['part']}, Item: {s['item']}, Pages: {len(s['pages'])}, Start: {s['page_start']}")

        def _section_text_len(s):
            # Total stripped character count across a section's pages.
            return sum(len(p["content"].strip()) for p in s["pages"])

        # Drop PART headings that accumulated almost no body text (bare stubs).
        sections = [s for s in sections if s["item"] is not None or _section_text_len(s) > 80]
        self._log(f"DEBUG: Sections after dropping empty PART stubs: {len(sections)}")

        if self.structure and sections:
            # Validate against the canonical PART/ITEM layout for this filing type.
            self._log(f"DEBUG: Validating against structure: {self.filing_type}")
            fixed = []
            for s in sections:
                part = s["part"]
                item = s["item"]

                if part is None and item and self.filing_type:
                    inferred = self._infer_part_for_item(self.filing_type, item)
                    if inferred:
                        self._log(f"DEBUG: Inferred {inferred} from {item}")
                        s = {**s, "part": inferred}
                        part = inferred

                if (part in self.structure) and (item is None or item in self.structure.get(part, [])):
                    fixed.append(s)
                else:
                    self._log(f"DEBUG: Dropped section - Part: {part}, Item: {item}")

            sections = fixed
            self._log(f"DEBUG: Sections after validation: {len(sections)}")

        return sections
306
+
307
+ def get_section(self, part: str, item: Optional[str] = None) -> Optional[Dict]:
308
+ part_normalized = self._normalize_section(part)
309
+ item_normalized = self._normalize_section(item) if item else None
310
+ sections = self.get_sections()
311
+
312
+ for section in sections:
313
+ if section["part"] == part_normalized:
314
+ if item_normalized is None or section["item"] == item_normalized:
315
+ return section
316
+ return None
sec2md/sections.py ADDED
@@ -0,0 +1,104 @@
1
+ """Section extraction utilities for SEC filings."""
2
+
3
+ from typing import List, Optional, Union
4
+ from sec2md.models import Page, Section, FilingType, Item10K, Item10Q, ITEM_10K_MAPPING, ITEM_10Q_MAPPING
5
+ from sec2md.section_extractor import SectionExtractor
6
+
7
+
8
def extract_sections(
    pages: List[Page],
    filing_type: FilingType,
    debug: bool = False
) -> List[Section]:
    """
    Extract sections from filing pages.

    Args:
        pages: List of Page objects from convert_to_markdown(return_pages=True)
        filing_type: Type of filing ("10-K" or "10-Q")
        debug: Enable debug logging

    Returns:
        List of Section objects, each containing pages for that section

    Example:
        >>> pages = sec2md.convert_to_markdown(html, return_pages=True)
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> for section in sections:
        ...     print(f"{section.item}: {section.item_title}")
    """
    # SectionExtractor works on plain dicts, so convert the Page objects first.
    extractor = SectionExtractor(
        pages=[{"page": p.number, "content": p.content} for p in pages],
        filing_type=filing_type,
        debug=debug,
    )

    # Re-wrap the extractor's dict output into Section/Page model objects.
    return [
        Section(
            part=data["part"],
            item=data["item"],
            item_title=data["item_title"],
            pages=[
                Page(number=pg["page"], content=pg["content"])
                for pg in data["pages"]
            ],
        )
        for data in extractor.get_sections()
    ]
58
+
59
+
60
def get_section(
    sections: List[Section],
    item: Union[Item10K, Item10Q, str],
    filing_type: FilingType = "10-K"
) -> Optional[Section]:
    """
    Get a specific section by item enum or string.

    Args:
        sections: List of sections from extract_sections()
        item: Item enum (Item10K.RISK_FACTORS) or string ("ITEM 1A" or "1A")
        filing_type: Type of filing ("10-K" or "10-Q")

    Returns:
        Section object if found, None otherwise

    Raises:
        ValueError: If the enum type does not match filing_type.

    Example:
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
        >>> print(risk.markdown())
    """
    # Map enum to (part, item) tuple
    if isinstance(item, Item10K):
        if filing_type != "10-K":
            raise ValueError(f"Item10K enum requires filing_type='10-K', got '{filing_type}'")
        target_part, target_item = ITEM_10K_MAPPING[item]
    elif isinstance(item, Item10Q):
        if filing_type != "10-Q":
            raise ValueError(f"Item10Q enum requires filing_type='10-Q', got '{filing_type}'")
        target_part, target_item = ITEM_10Q_MAPPING[item]
    else:
        # String format - uppercase and collapse internal whitespace so inputs
        # like "item  1a" match the extractor's normalized "ITEM 1A" keys
        # (the previous .strip()-only form missed doubled internal spaces).
        item_str = " ".join(str(item).upper().split())
        if not item_str.startswith("ITEM"):
            item_str = f"ITEM {item_str}"
        target_item = item_str
        target_part = None  # Match any part

    # Find matching section
    for section in sections:
        if section.item == target_item:
            if target_part is None or section.part == target_part:
                return section

    return None