PyPI - sec2md - Versions diffs - 0.1.0__tar.gz - Mend

sec2md 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sec2md might be problematic. Click here for more details.

Files changed (24)

sec2md-0.1.0/LICENSE +21 -0
sec2md-0.1.0/PKG-INFO +217 -0
sec2md-0.1.0/README.md +183 -0
sec2md-0.1.0/pyproject.toml +63 -0
sec2md-0.1.0/setup.cfg +4 -0
sec2md-0.1.0/src/sec2md/__init__.py +24 -0
sec2md-0.1.0/src/sec2md/absolute_table_parser.py +622 -0
sec2md-0.1.0/src/sec2md/chunker/__init__.py +0 -0
sec2md-0.1.0/src/sec2md/chunker/markdown_blocks.py +116 -0
sec2md-0.1.0/src/sec2md/chunker/markdown_chunk.py +76 -0
sec2md-0.1.0/src/sec2md/chunker/markdown_chunker.py +234 -0
sec2md-0.1.0/src/sec2md/chunking.py +66 -0
sec2md-0.1.0/src/sec2md/core.py +93 -0
sec2md-0.1.0/src/sec2md/models.py +153 -0
sec2md-0.1.0/src/sec2md/parser.py +586 -0
sec2md-0.1.0/src/sec2md/section_extractor.py +316 -0
sec2md-0.1.0/src/sec2md/sections.py +104 -0
sec2md-0.1.0/src/sec2md/table_parser.py +386 -0
sec2md-0.1.0/src/sec2md/utils.py +109 -0
sec2md-0.1.0/src/sec2md.egg-info/PKG-INFO +217 -0
sec2md-0.1.0/src/sec2md.egg-info/SOURCES.txt +22 -0
sec2md-0.1.0/src/sec2md.egg-info/dependency_links.txt +1 -0
sec2md-0.1.0/src/sec2md.egg-info/requires.txt +9 -0
sec2md-0.1.0/src/sec2md.egg-info/top_level.txt +1 -0

sec2md-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Lucas Astorian
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

sec2md-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,217 @@
+Metadata-Version: 2.4
+Name: sec2md
+Version: 0.1.0
+Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
+Author-email: Lucas Astorian <lucas@intellifin.ai>
+License: MIT
+Project-URL: Homepage, https://github.com/lucasastorian/sec2md
+Project-URL: Repository, https://github.com/lucasastorian/sec2md
+Project-URL: Issues, https://github.com/lucasastorian/sec2md/issues
+Keywords: sec,edgar,markdown,filings,10-k,10-q,llm,rag,ai,embeddings
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Financial and Insurance Industry
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Office/Business :: Financial
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: beautifulsoup4>=4.12.0
+Requires-Dist: lxml>=4.9.0
+Requires-Dist: requests>=2.31.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Dynamic: license-file
+# sec2md
+Transform messy SEC filings into clean, structured Markdown.
+**Built for AI. Optimized for retrieval. Ready for production.**
+[![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
+[![Downloads](https://pepy.tech/badge/sec2md)](https://pepy.tech/project/sec2md)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
+---
+## The Problem
+SEC filings are a nightmare for LLMs:
+- ❌ **XBRL tags** pollute the text (`<us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>`)
+- ❌ **Nested tables** with absolute positioning break standard parsers
+- ❌ **Inline CSS** and presentational HTML obscure semantic structure
+- ❌ **200+ page documents** with no clear section boundaries
+Standard HTML-to-text converters produce garbage. Your RAG pipeline deserves better.
+## The Solution
+`sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
+- ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
+- ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
+- ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
+- ✅ **Tracks pages** - Original pagination preserved for citation
+- ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
+- ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
+### What We Support
+| Document Type              | Status | Notes                                |
+|----------------------------|--------|--------------------------------------|
+| **10-K/Q Filings**         | ✅     | Full section extraction (ITEM 1-16)  |
+| **Financial Statements**   | ✅     | Tables preserved in Markdown         |
+| **Notes to Financials**    | ✅     | Automatic table unwrapping           |
+| **8-K Press Releases**     | ✅     | Clean prose extraction               |
+| **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance   |
+| **Exhibits** (Contracts)   | ✅     | Merger agreements, material contracts|
+---
+## Installation
+```bash
+pip install sec2md
+```
+## Quickstart
+```python
+import sec2md
+# Convert any SEC filing to clean Markdown
+md = sec2md.convert_to_markdown(
+    "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
+    user_agent="Your Name <you@example.com>"
+)
+```
+**Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
+**Output:** Clean, structured Markdown ready for LLMs
+```markdown
+## ITEM 1. Business
+Apple Inc. designs, manufactures, and markets smartphones, personal computers,
+tablets, wearables, and accessories worldwide...
+### Products
+| Product Category | Revenue (millions) |
+|------------------|-------------------|
+| iPhone           | $200,583          |
+| Mac              | $29,357           |
+| iPad             | $28,300           |
+...
+```
+## Core Features
+### 1️⃣ Section Extraction
+Extract specific sections from 10-K/10-Q filings with type-safe enums:
+```python
+from sec2md import Item10K
+pages = sec2md.convert_to_markdown(html, return_pages=True)
+sections = sec2md.extract_sections(pages, filing_type="10-K")
+# Get Risk Factors section
+risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
+print(risk.markdown())  # Just the risk factors text
+print(risk.page_range)   # (12, 28) - page citations
+```
+### 2️⃣ Page-Aware Chunking
+Intelligent chunking that preserves page numbers for citations:
+```python
+chunks = sec2md.chunk_pages(pages, chunk_size=512)
+for chunk in chunks:
+    print(f"Page {chunk.page}: {chunk.content[:100]}...")
+    # Use for embeddings, citations, or retrieval
+```
+### 3️⃣ RAG-Optimized Headers
+Boost retrieval quality by adding metadata to chunk embeddings:
+```python
+header = """# Apple Inc. (AAPL)
+Form 10-K | FY 2024 | Risk Factors"""
+chunks = sec2md.chunk_section(risk, header=header)
+# chunk.embedding_text includes header for better embeddings
+# chunk.content contains only the actual filing text
+```
+### 4️⃣ EdgarTools Integration
+Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
+```python
+from edgar import Company
+company = Company("AAPL")
+filing = company.get_filings(form="10-K").latest()
+md = sec2md.convert_to_markdown(filing.html())
+```
+---
+## Why Choose sec2md?
+### Just Parse It
+Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
+- 🚀 **Fast** - Processes 200-page filings in seconds
+- 🎯 **Accurate** - Purpose-built for SEC document structure
+- 🔧 **Simple** - One function call, zero configuration
+### Built for Production RAG
+Don't rebuild what we've already solved:
+- ✅ **Page tracking** - Cite sources with exact page numbers
+- ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
+- ✅ **Smart chunking** - Respects table boundaries, preserves context
+- ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
+### Avoid the Maintenance Nightmare
+Building your own SEC parser starts simple - scaling it is another story. What begins as BeautifulSoup and regex quickly turns into:
+- 🔴 Edge cases for every filing format variation
+- 🔴 Table parsing that breaks on nested structures
+- 🔴 XBRL tag stripping that misses new namespaces
+- 🔴 Section detection that fails on formatting changes
+**`sec2md` handles this for you.** Focus on building AI features, not parsing documents.
+---
+## Documentation
+📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
+- [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
+- [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
+- [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
+- [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
+- [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
+- [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
+---
+## Contributing
+We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+## License
+MIT © 2025

sec2md-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,183 @@
+# sec2md
+Transform messy SEC filings into clean, structured Markdown.
+**Built for AI. Optimized for retrieval. Ready for production.**
+[![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
+[![Downloads](https://pepy.tech/badge/sec2md)](https://pepy.tech/project/sec2md)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
+---
+## The Problem
+SEC filings are a nightmare for LLMs:
+- ❌ **XBRL tags** pollute the text (`<us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>`)
+- ❌ **Nested tables** with absolute positioning break standard parsers
+- ❌ **Inline CSS** and presentational HTML obscure semantic structure
+- ❌ **200+ page documents** with no clear section boundaries
+Standard HTML-to-text converters produce garbage. Your RAG pipeline deserves better.
+## The Solution
+`sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
+- ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
+- ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
+- ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
+- ✅ **Tracks pages** - Original pagination preserved for citation
+- ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
+- ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
+### What We Support
+| Document Type              | Status | Notes                                |
+|----------------------------|--------|--------------------------------------|
+| **10-K/Q Filings**         | ✅     | Full section extraction (ITEM 1-16)  |
+| **Financial Statements**   | ✅     | Tables preserved in Markdown         |
+| **Notes to Financials**    | ✅     | Automatic table unwrapping           |
+| **8-K Press Releases**     | ✅     | Clean prose extraction               |
+| **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance   |
+| **Exhibits** (Contracts)   | ✅     | Merger agreements, material contracts|
+---
+## Installation
+```bash
+pip install sec2md
+```
+## Quickstart
+```python
+import sec2md
+# Convert any SEC filing to clean Markdown
+md = sec2md.convert_to_markdown(
+    "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
+    user_agent="Your Name <you@example.com>"
+)
+```
+**Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
+**Output:** Clean, structured Markdown ready for LLMs
+```markdown
+## ITEM 1. Business
+Apple Inc. designs, manufactures, and markets smartphones, personal computers,
+tablets, wearables, and accessories worldwide...
+### Products
+| Product Category | Revenue (millions) |
+|------------------|-------------------|
+| iPhone           | $200,583          |
+| Mac              | $29,357           |
+| iPad             | $28,300           |
+...
+```
+## Core Features
+### 1️⃣ Section Extraction
+Extract specific sections from 10-K/10-Q filings with type-safe enums:
+```python
+from sec2md import Item10K
+pages = sec2md.convert_to_markdown(html, return_pages=True)
+sections = sec2md.extract_sections(pages, filing_type="10-K")
+# Get Risk Factors section
+risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
+print(risk.markdown())  # Just the risk factors text
+print(risk.page_range)   # (12, 28) - page citations
+```
+### 2️⃣ Page-Aware Chunking
+Intelligent chunking that preserves page numbers for citations:
+```python
+chunks = sec2md.chunk_pages(pages, chunk_size=512)
+for chunk in chunks:
+    print(f"Page {chunk.page}: {chunk.content[:100]}...")
+    # Use for embeddings, citations, or retrieval
+```
+### 3️⃣ RAG-Optimized Headers
+Boost retrieval quality by adding metadata to chunk embeddings:
+```python
+header = """# Apple Inc. (AAPL)
+Form 10-K | FY 2024 | Risk Factors"""
+chunks = sec2md.chunk_section(risk, header=header)
+# chunk.embedding_text includes header for better embeddings
+# chunk.content contains only the actual filing text
+```
+### 4️⃣ EdgarTools Integration
+Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
+```python
+from edgar import Company
+company = Company("AAPL")
+filing = company.get_filings(form="10-K").latest()
+md = sec2md.convert_to_markdown(filing.html())
+```
+---
+## Why Choose sec2md?
+### Just Parse It
+Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
+- 🚀 **Fast** - Processes 200-page filings in seconds
+- 🎯 **Accurate** - Purpose-built for SEC document structure
+- 🔧 **Simple** - One function call, zero configuration
+### Built for Production RAG
+Don't rebuild what we've already solved:
+- ✅ **Page tracking** - Cite sources with exact page numbers
+- ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
+- ✅ **Smart chunking** - Respects table boundaries, preserves context
+- ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
+### Avoid the Maintenance Nightmare
+Building your own SEC parser starts simple - scaling it is another story. What begins as BeautifulSoup and regex quickly turns into:
+- 🔴 Edge cases for every filing format variation
+- 🔴 Table parsing that breaks on nested structures
+- 🔴 XBRL tag stripping that misses new namespaces
+- 🔴 Section detection that fails on formatting changes
+**`sec2md` handles this for you.** Focus on building AI features, not parsing documents.
+---
+## Documentation
+📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
+- [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
+- [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
+- [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
+- [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
+- [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
+- [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
+---
+## Contributing
+We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+## License
+MIT © 2025

sec2md-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,63 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "sec2md"
+version = "0.1.0"
+description = "Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {text = "MIT"}
+authors = [
+    {name = "Lucas Astorian", email = "lucas@intellifin.ai"}
+]
+keywords = ["sec", "edgar", "markdown", "filings", "10-k", "10-q", "llm", "rag", "ai", "embeddings"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Financial and Insurance Industry",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Office/Business :: Financial",
+    "Topic :: Text Processing :: Markup :: Markdown",
+]
+dependencies = [
+    "beautifulsoup4>=4.12.0",
+    "lxml>=4.9.0",
+    "requests>=2.31.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "pytest-cov>=4.0.0",
+    "black>=23.0.0",
+    "ruff>=0.1.0",
+]
+[project.urls]
+Homepage = "https://github.com/lucasastorian/sec2md"
+Repository = "https://github.com/lucasastorian/sec2md"
+Issues = "https://github.com/lucasastorian/sec2md/issues"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.black]
+line-length = 100
+target-version = ["py39"]
+[tool.ruff]
+line-length = 100
+target-version = "py39"
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]

sec2md-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

sec2md-0.1.0/src/sec2md/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""sec2md: Convert SEC filings to high-quality Markdown."""
+from sec2md.core import convert_to_markdown
+from sec2md.utils import flatten_note
+from sec2md.sections import extract_sections, get_section
+from sec2md.chunking import chunk_pages, chunk_section
+from sec2md.models import Page, Section, Item10K, Item10Q, FilingType
+from sec2md.chunker.markdown_chunk import MarkdownChunk
+__version__ = "0.1.0"
+__all__ = [
+    "convert_to_markdown",
+    "flatten_note",
+    "extract_sections",
+    "get_section",
+    "chunk_pages",
+    "chunk_section",
+    "Page",
+    "Section",
+    "Item10K",
+    "Item10Q",
+    "FilingType",
+    "MarkdownChunk",
+]