PyPI - sec2md - Versions diffs - 0.1.4__tar.gz - Mend

sec2md 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

sec2md-0.1.4/LICENSE +21 -0
sec2md-0.1.4/PKG-INFO +215 -0
sec2md-0.1.4/README.md +180 -0
sec2md-0.1.4/pyproject.toml +64 -0
sec2md-0.1.4/setup.cfg +4 -0
sec2md-0.1.4/src/sec2md/__init__.py +34 -0
sec2md-0.1.4/src/sec2md/absolute_table_parser.py +622 -0
sec2md-0.1.4/src/sec2md/chunker/__init__.py +0 -0
sec2md-0.1.4/src/sec2md/chunker/markdown_blocks.py +127 -0
sec2md-0.1.4/src/sec2md/chunker/markdown_chunk.py +141 -0
sec2md-0.1.4/src/sec2md/chunker/markdown_chunker.py +266 -0
sec2md-0.1.4/src/sec2md/chunking.py +179 -0
sec2md-0.1.4/src/sec2md/core.py +93 -0
sec2md-0.1.4/src/sec2md/models.py +280 -0
sec2md-0.1.4/src/sec2md/parser.py +1217 -0
sec2md-0.1.4/src/sec2md/section_extractor.py +388 -0
sec2md-0.1.4/src/sec2md/sections.py +84 -0
sec2md-0.1.4/src/sec2md/table_parser.py +386 -0
sec2md-0.1.4/src/sec2md/utils.py +109 -0
sec2md-0.1.4/src/sec2md.egg-info/PKG-INFO +215 -0
sec2md-0.1.4/src/sec2md.egg-info/SOURCES.txt +22 -0
sec2md-0.1.4/src/sec2md.egg-info/dependency_links.txt +1 -0
sec2md-0.1.4/src/sec2md.egg-info/requires.txt +10 -0
sec2md-0.1.4/src/sec2md.egg-info/top_level.txt +1 -0

sec2md-0.1.4/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Lucas Astorian
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

sec2md-0.1.4/PKG-INFO ADDED Viewed

@@ -0,0 +1,215 @@
+Metadata-Version: 2.4
+Name: sec2md
+Version: 0.1.4
+Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
+Author-email: Lucas Astorian <lucas@intellifin.ai>
+License: MIT
+Project-URL: Homepage, https://github.com/lucasastorian/sec2md
+Project-URL: Repository, https://github.com/lucasastorian/sec2md
+Project-URL: Issues, https://github.com/lucasastorian/sec2md/issues
+Keywords: sec,edgar,markdown,filings,10-k,10-q,llm,rag,ai,embeddings
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Financial and Insurance Industry
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Office/Business :: Financial
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: beautifulsoup4>=4.12.0
+Requires-Dist: lxml>=4.9.0
+Requires-Dist: requests>=2.31.0
+Requires-Dist: tiktoken>=0.5.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Dynamic: license-file
+# sec2md
+[![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
+Transform messy SEC filings into clean, structured Markdown.
+**Built for AI. Optimized for retrieval. Ready for production.**
+![Before and After Comparison](comparison.png)
+*Apple 10-K cover page: Raw SEC HTML (left) vs. Clean Markdown (right)*
+---
+## The Problem
+RAG pipelines fail on SEC filings because **standard parsers destroy document structure.**
+When you flatten a 200-page 10-K to plain text:
+- ❌ **Tables break** — Complex financial statements become misaligned text
+- ❌ **Pages are lost** — Can't cite sources or trace answers back
+- ❌ **Sections merge** — Risk Factors and MD&A become indistinguishable
+- ❌ **Formatting is stripped** — Headers, bold text, and lists (LLM reasoning cues) are gone
+- ❌ **Retrieval fails** — Chunks without structure return wrong context
+Your RAG system is only as good as your data. Garbage in, garbage out.
+## The Solution
+`sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
+- ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
+- ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
+- ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
+- ✅ **Tracks pages** - Original pagination preserved for citation
+- ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
+- ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
+### What We Support
+| Document Type              | Status | Notes                                |
+|----------------------------|--------|--------------------------------------|
+| **10-K/Q Filings**         | ✅     | Full section extraction (ITEM 1-16)  |
+| **Financial Statements**   | ✅     | Tables preserved in Markdown         |
+| **Notes to Financials**    | ✅     | Automatic table unwrapping           |
+| **8-K Press Releases**     | ✅     | Clean prose extraction               |
+| **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance   |
+| **Exhibits** (Contracts)   | ✅     | Merger agreements, material contracts|
+---
+## Installation
+```bash
+pip install sec2md
+```
+## Quickstart
+```python
+import sec2md
+# Convert any SEC filing to clean Markdown
+md = sec2md.convert_to_markdown(
+    "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
+    user_agent="Your Name <you@example.com>"
+)
+```
+**Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
+**Output:** Clean, structured Markdown ready for LLMs
+```markdown
+## ITEM 1. Business
+Apple Inc. designs, manufactures, and markets smartphones, personal computers,
+tablets, wearables, and accessories worldwide...
+### Products
+| Product Category | Revenue (millions) |
+|------------------|-------------------|
+| iPhone           | $200,583          |
+| Mac              | $29,357           |
+| iPad             | $28,300           |
+...
+```
+## Core Features
+### 1️⃣ Section Extraction
+Extract specific sections from 10-K/10-Q filings with type-safe enums:
+```python
+from sec2md import Item10K
+pages = sec2md.convert_to_markdown(html, return_pages=True)
+sections = sec2md.extract_sections(pages, filing_type="10-K")
+# Get Risk Factors section
+risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
+print(risk.markdown())  # Just the risk factors text
+print(risk.page_range)   # (12, 28) - page citations
+```
+### 2️⃣ Page-Aware Chunking
+Intelligent chunking that preserves page numbers for citations:
+```python
+chunks = sec2md.chunk_pages(pages, chunk_size=512)
+for chunk in chunks:
+    print(f"Page {chunk.page}: {chunk.content[:100]}...")
+    # Use for embeddings, citations, or retrieval
+```
+### 3️⃣ RAG-Optimized Headers
+Boost retrieval quality by adding metadata to chunk embeddings:
+```python
+header = """# Apple Inc. (AAPL)
+Form 10-K | FY 2024 | Risk Factors"""
+chunks = sec2md.chunk_section(risk, header=header)
+# chunk.embedding_text includes header for better embeddings
+# chunk.content contains only the actual filing text
+```
+### 4️⃣ EdgarTools Integration
+Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
+```python
+from edgar import Company
+company = Company("AAPL")
+filing = company.get_filings(form="10-K").latest()
+md = sec2md.convert_to_markdown(filing.html())
+```
+---
+## Why Choose sec2md?
+### Just Parse It
+Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
+- 🚀 **Fast** - Processes 200-page filings in seconds
+- 🎯 **Accurate** - Purpose-built for SEC document structure
+- 🔧 **Simple** - One function call, zero configuration
+### Built for Agentic RAG
+Don't rebuild what we've already solved:
+- ✅ **Page tracking** - Cite sources with exact page numbers
+- ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
+- ✅ **Smart chunking** - Respects table boundaries, preserves context
+- ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
+---
+## Documentation
+📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
+- [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
+- [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
+- [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
+- [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
+- [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
+- [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
+---
+## Contributing
+We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+## License
+MIT © 2025

sec2md-0.1.4/README.md ADDED Viewed

@@ -0,0 +1,180 @@
+# sec2md
+[![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
+Transform messy SEC filings into clean, structured Markdown.
+**Built for AI. Optimized for retrieval. Ready for production.**
+![Before and After Comparison](comparison.png)
+*Apple 10-K cover page: Raw SEC HTML (left) vs. Clean Markdown (right)*
+---
+## The Problem
+RAG pipelines fail on SEC filings because **standard parsers destroy document structure.**
+When you flatten a 200-page 10-K to plain text:
+- ❌ **Tables break** — Complex financial statements become misaligned text
+- ❌ **Pages are lost** — Can't cite sources or trace answers back
+- ❌ **Sections merge** — Risk Factors and MD&A become indistinguishable
+- ❌ **Formatting is stripped** — Headers, bold text, and lists (LLM reasoning cues) are gone
+- ❌ **Retrieval fails** — Chunks without structure return wrong context
+Your RAG system is only as good as your data. Garbage in, garbage out.
+## The Solution
+`sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
+- ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
+- ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
+- ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
+- ✅ **Tracks pages** - Original pagination preserved for citation
+- ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
+- ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
+### What We Support
+| Document Type              | Status | Notes                                |
+|----------------------------|--------|--------------------------------------|
+| **10-K/Q Filings**         | ✅     | Full section extraction (ITEM 1-16)  |
+| **Financial Statements**   | ✅     | Tables preserved in Markdown         |
+| **Notes to Financials**    | ✅     | Automatic table unwrapping           |
+| **8-K Press Releases**     | ✅     | Clean prose extraction               |
+| **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance   |
+| **Exhibits** (Contracts)   | ✅     | Merger agreements, material contracts|
+---
+## Installation
+```bash
+pip install sec2md
+```
+## Quickstart
+```python
+import sec2md
+# Convert any SEC filing to clean Markdown
+md = sec2md.convert_to_markdown(
+    "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
+    user_agent="Your Name <you@example.com>"
+)
+```
+**Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
+**Output:** Clean, structured Markdown ready for LLMs
+```markdown
+## ITEM 1. Business
+Apple Inc. designs, manufactures, and markets smartphones, personal computers,
+tablets, wearables, and accessories worldwide...
+### Products
+| Product Category | Revenue (millions) |
+|------------------|-------------------|
+| iPhone           | $200,583          |
+| Mac              | $29,357           |
+| iPad             | $28,300           |
+...
+```
+## Core Features
+### 1️⃣ Section Extraction
+Extract specific sections from 10-K/10-Q filings with type-safe enums:
+```python
+from sec2md import Item10K
+pages = sec2md.convert_to_markdown(html, return_pages=True)
+sections = sec2md.extract_sections(pages, filing_type="10-K")
+# Get Risk Factors section
+risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
+print(risk.markdown())  # Just the risk factors text
+print(risk.page_range)   # (12, 28) - page citations
+```
+### 2️⃣ Page-Aware Chunking
+Intelligent chunking that preserves page numbers for citations:
+```python
+chunks = sec2md.chunk_pages(pages, chunk_size=512)
+for chunk in chunks:
+    print(f"Page {chunk.page}: {chunk.content[:100]}...")
+    # Use for embeddings, citations, or retrieval
+```
+### 3️⃣ RAG-Optimized Headers
+Boost retrieval quality by adding metadata to chunk embeddings:
+```python
+header = """# Apple Inc. (AAPL)
+Form 10-K | FY 2024 | Risk Factors"""
+chunks = sec2md.chunk_section(risk, header=header)
+# chunk.embedding_text includes header for better embeddings
+# chunk.content contains only the actual filing text
+```
+### 4️⃣ EdgarTools Integration
+Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
+```python
+from edgar import Company
+company = Company("AAPL")
+filing = company.get_filings(form="10-K").latest()
+md = sec2md.convert_to_markdown(filing.html())
+```
+---
+## Why Choose sec2md?
+### Just Parse It
+Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
+- 🚀 **Fast** - Processes 200-page filings in seconds
+- 🎯 **Accurate** - Purpose-built for SEC document structure
+- 🔧 **Simple** - One function call, zero configuration
+### Built for Agentic RAG
+Don't rebuild what we've already solved:
+- ✅ **Page tracking** - Cite sources with exact page numbers
+- ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
+- ✅ **Smart chunking** - Respects table boundaries, preserves context
+- ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
+---
+## Documentation
+📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
+- [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
+- [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
+- [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
+- [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
+- [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
+- [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
+---
+## Contributing
+We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+## License
+MIT © 2025

sec2md-0.1.4/pyproject.toml ADDED Viewed

@@ -0,0 +1,64 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "sec2md"
+version = "0.1.4"
+description = "Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {text = "MIT"}
+authors = [
+    {name = "Lucas Astorian", email = "lucas@intellifin.ai"}
+]
+keywords = ["sec", "edgar", "markdown", "filings", "10-k", "10-q", "llm", "rag", "ai", "embeddings"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Financial and Insurance Industry",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Office/Business :: Financial",
+    "Topic :: Text Processing :: Markup :: Markdown",
+]
+dependencies = [
+    "beautifulsoup4>=4.12.0",
+    "lxml>=4.9.0",
+    "requests>=2.31.0",
+    "tiktoken>=0.5.0",  # Exact token counting, installed by default (note: pip's --no-deps skips ALL dependencies, not just this one)
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "pytest-cov>=4.0.0",
+    "black>=23.0.0",
+    "ruff>=0.1.0",
+]
+[project.urls]
+Homepage = "https://github.com/lucasastorian/sec2md"
+Repository = "https://github.com/lucasastorian/sec2md"
+Issues = "https://github.com/lucasastorian/sec2md/issues"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.black]
+line-length = 100
+target-version = ["py39"]
+[tool.ruff]
+line-length = 100
+target-version = "py39"
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]

sec2md-0.1.4/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

sec2md-0.1.4/src/sec2md/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""sec2md: Convert SEC filings to high-quality Markdown."""
+from sec2md.core import convert_to_markdown
+from sec2md.utils import flatten_note
+from sec2md.sections import extract_sections, get_section
+from sec2md.chunking import chunk_pages, chunk_section, merge_text_blocks, chunk_text_block
+from sec2md.models import Page, Section, Item10K, Item10Q, FilingType, Element, TextBlock
+from sec2md.chunker.markdown_chunk import MarkdownChunk
+from sec2md.chunker.markdown_chunker import MarkdownChunker
+from sec2md.parser import Parser
+from sec2md.section_extractor import SectionExtractor
+__version__ = "0.1.4"
+__all__ = [
+    "convert_to_markdown",
+    "flatten_note",
+    "extract_sections",
+    "get_section",
+    "chunk_pages",
+    "chunk_section",
+    "merge_text_blocks",
+    "chunk_text_block",
+    "Page",
+    "Section",
+    "Element",
+    "TextBlock",
+    "Item10K",
+    "Item10Q",
+    "FilingType",
+    "MarkdownChunk",
+    "MarkdownChunker",
+    "Parser",
+    "SectionExtractor",
+]