sec2md 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

sec2md-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Lucas Astorian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sec2md-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.4
2
+ Name: sec2md
3
+ Version: 0.1.0
4
+ Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
5
+ Author-email: Lucas Astorian <lucas@intellifin.ai>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/lucasastorian/sec2md
8
+ Project-URL: Repository, https://github.com/lucasastorian/sec2md
9
+ Project-URL: Issues, https://github.com/lucasastorian/sec2md/issues
10
+ Keywords: sec,edgar,markdown,filings,10-k,10-q,llm,rag,ai,embeddings
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Financial and Insurance Industry
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Office/Business :: Financial
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: lxml>=4.9.0
27
+ Requires-Dist: requests>=2.31.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
30
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
31
+ Requires-Dist: black>=23.0.0; extra == "dev"
32
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
33
+ Dynamic: license-file
34
+
35
+ # sec2md
36
+
37
+ Transform messy SEC filings into clean, structured Markdown.
38
+ **Built for AI. Optimized for retrieval. Ready for production.**
39
+
40
+ [![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
41
+ [![Downloads](https://pepy.tech/badge/sec2md)](https://pepy.tech/project/sec2md)
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
43
+ [![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
44
+
45
+ ---
46
+
47
+ ## The Problem
48
+
49
+ SEC filings are a nightmare for LLMs:
50
+ - ❌ **XBRL tags** pollute the text (`<us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>`)
51
+ - ❌ **Nested tables** with absolute positioning break standard parsers
52
+ - ❌ **Inline CSS** and presentational HTML obscure semantic structure
53
+ - ❌ **200+ page documents** with no clear section boundaries
54
+
55
+ Standard HTML-to-text converters produce garbage. Your RAG pipeline deserves better.
56
+
57
+ ## The Solution
58
+
59
+ `sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
60
+
61
+ - ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
62
+ - ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
63
+ - ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
64
+ - ✅ **Tracks pages** - Original pagination preserved for citation
65
+ - ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
66
+ - ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
67
+
68
+ ### What We Support
69
+
70
+ | Document Type | Status | Notes |
71
+ |----------------------------|--------|--------------------------------------|
72
+ | **10-K/Q Filings** | ✅ | Full section extraction (ITEM 1-16) |
73
+ | **Financial Statements** | ✅ | Tables preserved in Markdown |
74
+ | **Notes to Financials** | ✅ | Automatic table unwrapping |
75
+ | **8-K Press Releases** | ✅ | Clean prose extraction |
76
+ | **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance |
77
+ | **Exhibits** (Contracts) | ✅ | Merger agreements, material contracts |
78
+
79
+ ---
80
+
81
+ ## Installation
82
+
83
+ ```bash
84
+ pip install sec2md
85
+ ```
86
+
87
+ ## Quickstart
88
+
89
+ ```python
90
+ import sec2md
91
+
92
+ # Convert any SEC filing to clean Markdown
93
+ md = sec2md.convert_to_markdown(
94
+ "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
95
+ user_agent="Your Name <you@example.com>"
96
+ )
97
+ ```
98
+
99
+ **Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
100
+ **Output:** Clean, structured Markdown ready for LLMs
101
+
102
+ ```markdown
103
+ ## ITEM 1. Business
104
+
105
+ Apple Inc. designs, manufactures, and markets smartphones, personal computers,
106
+ tablets, wearables, and accessories worldwide...
107
+
108
+ ### Products
109
+
110
+ | Product Category | Revenue (millions) |
111
+ |------------------|-------------------|
112
+ | iPhone | $200,583 |
113
+ | Mac | $29,357 |
114
+ | iPad | $28,300 |
115
+ ...
116
+ ```
117
+
118
+ ## Core Features
119
+
120
+ ### 1️⃣ Section Extraction
121
+ Extract specific sections from 10-K/10-Q filings with type-safe enums:
122
+
123
+ ```python
124
+ from sec2md import Item10K
125
+
126
+ pages = sec2md.convert_to_markdown(html, return_pages=True)
127
+ sections = sec2md.extract_sections(pages, filing_type="10-K")
128
+
129
+ # Get Risk Factors section
130
+ risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
131
+ print(risk.markdown()) # Just the risk factors text
132
+ print(risk.page_range) # (12, 28) - page citations
133
+ ```
134
+
135
+ ### 2️⃣ Page-Aware Chunking
136
+ Intelligent chunking that preserves page numbers for citations:
137
+
138
+ ```python
139
+ chunks = sec2md.chunk_pages(pages, chunk_size=512)
140
+
141
+ for chunk in chunks:
142
+     print(f"Page {chunk.page}: {chunk.content[:100]}...")
143
+ # Use for embeddings, citations, or retrieval
144
+ ```
145
+
146
+ ### 3️⃣ RAG-Optimized Headers
147
+ Boost retrieval quality by adding metadata to chunk embeddings:
148
+
149
+ ```python
150
+ header = """# Apple Inc. (AAPL)
151
+ Form 10-K | FY 2024 | Risk Factors"""
152
+
153
+ chunks = sec2md.chunk_section(risk, header=header)
154
+
155
+ # chunk.embedding_text includes header for better embeddings
156
+ # chunk.content contains only the actual filing text
157
+ ```
158
+
159
+ ### 4️⃣ EdgarTools Integration
160
+ Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
161
+
162
+ ```python
163
+ from edgar import Company
164
+ company = Company("AAPL")
165
+ filing = company.get_filings(form="10-K").latest()
166
+
167
+ md = sec2md.convert_to_markdown(filing.html())
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Why Choose sec2md?
173
+
174
+ ### Just Parse It
175
+ Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
176
+ - 🚀 **Fast** - Processes 200-page filings in seconds
177
+ - 🎯 **Accurate** - Purpose-built for SEC document structure
178
+ - 🔧 **Simple** - One function call, zero configuration
179
+
180
+ ### Built for Production RAG
181
+ Don't rebuild what we've already solved:
182
+ - ✅ **Page tracking** - Cite sources with exact page numbers
183
+ - ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
184
+ - ✅ **Smart chunking** - Respects table boundaries, preserves context
185
+ - ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
186
+
187
+ ### Avoid the Maintenance Nightmare
188
+ Building your own SEC parser starts simple - scaling it is another story. What begins as BeautifulSoup and regex quickly turns into:
189
+ - 🔴 Edge cases for every filing format variation
190
+ - 🔴 Table parsing that breaks on nested structures
191
+ - 🔴 XBRL tag stripping that misses new namespaces
192
+ - 🔴 Section detection that fails on formatting changes
193
+
194
+ **`sec2md` handles this for you.** Focus on building AI features, not parsing documents.
195
+
196
+ ---
197
+
198
+ ## Documentation
199
+
200
+ 📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
201
+
202
+ - [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
203
+ - [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
204
+ - [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
205
+ - [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
206
+ - [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
207
+ - [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
208
+
209
+ ---
210
+
211
+ ## Contributing
212
+
213
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
214
+
215
+ ## License
216
+
217
+ MIT © 2025
sec2md-0.1.0/README.md ADDED
@@ -0,0 +1,183 @@
1
+ # sec2md
2
+
3
+ Transform messy SEC filings into clean, structured Markdown.
4
+ **Built for AI. Optimized for retrieval. Ready for production.**
5
+
6
+ [![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
7
+ [![Downloads](https://pepy.tech/badge/sec2md)](https://pepy.tech/project/sec2md)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
9
+ [![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
10
+
11
+ ---
12
+
13
+ ## The Problem
14
+
15
+ SEC filings are a nightmare for LLMs:
16
+ - ❌ **XBRL tags** pollute the text (`<us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>`)
17
+ - ❌ **Nested tables** with absolute positioning break standard parsers
18
+ - ❌ **Inline CSS** and presentational HTML obscure semantic structure
19
+ - ❌ **200+ page documents** with no clear section boundaries
20
+
21
+ Standard HTML-to-text converters produce garbage. Your RAG pipeline deserves better.
22
+
23
+ ## The Solution
24
+
25
+ `sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
26
+
27
+ - ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
28
+ - ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
29
+ - ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
30
+ - ✅ **Tracks pages** - Original pagination preserved for citation
31
+ - ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
32
+ - ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
33
+
34
+ ### What We Support
35
+
36
+ | Document Type | Status | Notes |
37
+ |----------------------------|--------|--------------------------------------|
38
+ | **10-K/Q Filings** | ✅ | Full section extraction (ITEM 1-16) |
39
+ | **Financial Statements** | ✅ | Tables preserved in Markdown |
40
+ | **Notes to Financials** | ✅ | Automatic table unwrapping |
41
+ | **8-K Press Releases** | ✅ | Clean prose extraction |
42
+ | **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance |
43
+ | **Exhibits** (Contracts) | ✅ | Merger agreements, material contracts |
44
+
45
+ ---
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install sec2md
51
+ ```
52
+
53
+ ## Quickstart
54
+
55
+ ```python
56
+ import sec2md
57
+
58
+ # Convert any SEC filing to clean Markdown
59
+ md = sec2md.convert_to_markdown(
60
+ "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
61
+ user_agent="Your Name <you@example.com>"
62
+ )
63
+ ```
64
+
65
+ **Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
66
+ **Output:** Clean, structured Markdown ready for LLMs
67
+
68
+ ```markdown
69
+ ## ITEM 1. Business
70
+
71
+ Apple Inc. designs, manufactures, and markets smartphones, personal computers,
72
+ tablets, wearables, and accessories worldwide...
73
+
74
+ ### Products
75
+
76
+ | Product Category | Revenue (millions) |
77
+ |------------------|-------------------|
78
+ | iPhone | $200,583 |
79
+ | Mac | $29,357 |
80
+ | iPad | $28,300 |
81
+ ...
82
+ ```
83
+
84
+ ## Core Features
85
+
86
+ ### 1️⃣ Section Extraction
87
+ Extract specific sections from 10-K/10-Q filings with type-safe enums:
88
+
89
+ ```python
90
+ from sec2md import Item10K
91
+
92
+ pages = sec2md.convert_to_markdown(html, return_pages=True)
93
+ sections = sec2md.extract_sections(pages, filing_type="10-K")
94
+
95
+ # Get Risk Factors section
96
+ risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
97
+ print(risk.markdown()) # Just the risk factors text
98
+ print(risk.page_range) # (12, 28) - page citations
99
+ ```
100
+
101
+ ### 2️⃣ Page-Aware Chunking
102
+ Intelligent chunking that preserves page numbers for citations:
103
+
104
+ ```python
105
+ chunks = sec2md.chunk_pages(pages, chunk_size=512)
106
+
107
+ for chunk in chunks:
108
+     print(f"Page {chunk.page}: {chunk.content[:100]}...")
109
+ # Use for embeddings, citations, or retrieval
110
+ ```
111
+
112
+ ### 3️⃣ RAG-Optimized Headers
113
+ Boost retrieval quality by adding metadata to chunk embeddings:
114
+
115
+ ```python
116
+ header = """# Apple Inc. (AAPL)
117
+ Form 10-K | FY 2024 | Risk Factors"""
118
+
119
+ chunks = sec2md.chunk_section(risk, header=header)
120
+
121
+ # chunk.embedding_text includes header for better embeddings
122
+ # chunk.content contains only the actual filing text
123
+ ```
124
+
125
+ ### 4️⃣ EdgarTools Integration
126
+ Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
127
+
128
+ ```python
129
+ from edgar import Company
130
+ company = Company("AAPL")
131
+ filing = company.get_filings(form="10-K").latest()
132
+
133
+ md = sec2md.convert_to_markdown(filing.html())
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Why Choose sec2md?
139
+
140
+ ### Just Parse It
141
+ Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
142
+ - 🚀 **Fast** - Processes 200-page filings in seconds
143
+ - 🎯 **Accurate** - Purpose-built for SEC document structure
144
+ - 🔧 **Simple** - One function call, zero configuration
145
+
146
+ ### Built for Production RAG
147
+ Don't rebuild what we've already solved:
148
+ - ✅ **Page tracking** - Cite sources with exact page numbers
149
+ - ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
150
+ - ✅ **Smart chunking** - Respects table boundaries, preserves context
151
+ - ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
152
+
153
+ ### Avoid the Maintenance Nightmare
154
+ Building your own SEC parser starts simple - scaling it is another story. What begins as BeautifulSoup and regex quickly turns into:
155
+ - 🔴 Edge cases for every filing format variation
156
+ - 🔴 Table parsing that breaks on nested structures
157
+ - 🔴 XBRL tag stripping that misses new namespaces
158
+ - 🔴 Section detection that fails on formatting changes
159
+
160
+ **`sec2md` handles this for you.** Focus on building AI features, not parsing documents.
161
+
162
+ ---
163
+
164
+ ## Documentation
165
+
166
+ 📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
167
+
168
+ - [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
169
+ - [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
170
+ - [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
171
+ - [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
172
+ - [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
173
+ - [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
174
+
175
+ ---
176
+
177
+ ## Contributing
178
+
179
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
180
+
181
+ ## License
182
+
183
+ MIT © 2025
sec2md-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,63 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sec2md"
7
+ version = "0.1.0"
8
+ description = "Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Lucas Astorian", email = "lucas@intellifin.ai"}
14
+ ]
15
+ keywords = ["sec", "edgar", "markdown", "filings", "10-k", "10-q", "llm", "rag", "ai", "embeddings"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Financial and Insurance Industry",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Office/Business :: Financial",
27
+ "Topic :: Text Processing :: Markup :: Markdown",
28
+ ]
29
+
30
+ dependencies = [
31
+ "beautifulsoup4>=4.12.0",
32
+ "lxml>=4.9.0",
33
+ "requests>=2.31.0",
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ dev = [
38
+ "pytest>=7.0.0",
39
+ "pytest-cov>=4.0.0",
40
+ "black>=23.0.0",
41
+ "ruff>=0.1.0",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/lucasastorian/sec2md"
46
+ Repository = "https://github.com/lucasastorian/sec2md"
47
+ Issues = "https://github.com/lucasastorian/sec2md/issues"
48
+
49
+
50
+ [tool.setuptools.packages.find]
51
+ where = ["src"]
52
+
53
+ [tool.black]
54
+ line-length = 100
55
+ target-version = ["py39"]
56
+
57
+ [tool.ruff]
58
+ line-length = 100
59
+ target-version = "py39"
60
+
61
+ [tool.pytest.ini_options]
62
+ testpaths = ["tests"]
63
+ python_files = ["test_*.py"]
sec2md-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
sec2md-0.1.0/src/sec2md/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """sec2md: Convert SEC filings to high-quality Markdown."""
2
+
3
+ from sec2md.core import convert_to_markdown
4
+ from sec2md.utils import flatten_note
5
+ from sec2md.sections import extract_sections, get_section
6
+ from sec2md.chunking import chunk_pages, chunk_section
7
+ from sec2md.models import Page, Section, Item10K, Item10Q, FilingType
8
+ from sec2md.chunker.markdown_chunk import MarkdownChunk
9
+
10
+ __version__ = "0.1.0"
11
+ __all__ = [
12
+ "convert_to_markdown",
13
+ "flatten_note",
14
+ "extract_sections",
15
+ "get_section",
16
+ "chunk_pages",
17
+ "chunk_section",
18
+ "Page",
19
+ "Section",
20
+ "Item10K",
21
+ "Item10Q",
22
+ "FilingType",
23
+ "MarkdownChunk",
24
+ ]