sec2md 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sec2md-0.1.4/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Lucas Astorian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sec2md-0.1.4/PKG-INFO ADDED
@@ -0,0 +1,215 @@
1
+ Metadata-Version: 2.4
2
+ Name: sec2md
3
+ Version: 0.1.4
4
+ Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
5
+ Author-email: Lucas Astorian <lucas@intellifin.ai>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/lucasastorian/sec2md
8
+ Project-URL: Repository, https://github.com/lucasastorian/sec2md
9
+ Project-URL: Issues, https://github.com/lucasastorian/sec2md/issues
10
+ Keywords: sec,edgar,markdown,filings,10-k,10-q,llm,rag,ai,embeddings
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Financial and Insurance Industry
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Office/Business :: Financial
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: lxml>=4.9.0
27
+ Requires-Dist: requests>=2.31.0
28
+ Requires-Dist: tiktoken>=0.5.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
31
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
32
+ Requires-Dist: black>=23.0.0; extra == "dev"
33
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # sec2md
37
+
38
+ [![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
39
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
40
+ [![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
41
+
42
+ Transform messy SEC filings into clean, structured Markdown.
43
+ **Built for AI. Optimized for retrieval. Ready for production.**
44
+
45
+ ![Before and After Comparison](comparison.png)
46
+ *Apple 10-K cover page: Raw SEC HTML (left) vs. Clean Markdown (right)*
47
+
48
+ ---
49
+
50
+ ## The Problem
51
+
52
+ RAG pipelines fail on SEC filings because **standard parsers destroy document structure.**
53
+
54
+ When you flatten a 200-page 10-K to plain text:
55
+
56
+ - ❌ **Tables break** — Complex financial statements become misaligned text
57
+ - ❌ **Pages are lost** — Can't cite sources or trace answers back
58
+ - ❌ **Sections merge** — Risk Factors and MD&A become indistinguishable
59
+ - ❌ **Formatting is stripped** — Headers, bold text, and lists (LLM reasoning cues) are gone
60
+ - ❌ **Retrieval fails** — Chunks without structure return wrong context
61
+
62
+ Your RAG system is only as good as your data. Garbage in, garbage out.
63
+
64
+ ## The Solution
65
+
66
+ `sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
67
+
68
+ - ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
69
+ - ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
70
+ - ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
71
+ - ✅ **Tracks pages** - Original pagination preserved for citation
72
+ - ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
73
+ - ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
74
+
75
+ ### What We Support
76
+
77
+ | Document Type | Status | Notes |
78
+ |----------------------------|--------|--------------------------------------|
79
+ | **10-K/Q Filings** | ✅ | Full section extraction (ITEM 1-16) |
80
+ | **Financial Statements** | ✅ | Tables preserved in Markdown |
81
+ | **Notes to Financials** | ✅ | Automatic table unwrapping |
82
+ | **8-K Press Releases** | ✅ | Clean prose extraction |
83
+ | **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance |
84
+ | **Exhibits** (Contracts) | ✅ | Merger agreements, material contracts|
85
+
86
+ ---
87
+
88
+ ## Installation
89
+
90
+ ```bash
91
+ pip install sec2md
92
+ ```
93
+
94
+ ## Quickstart
95
+
96
+ ```python
97
+ import sec2md
98
+
99
+ # Convert any SEC filing to clean Markdown
100
+ md = sec2md.convert_to_markdown(
101
+ "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
102
+ user_agent="Your Name <you@example.com>"
103
+ )
104
+ ```
105
+
106
+ **Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
107
+ **Output:** Clean, structured Markdown ready for LLMs
108
+
109
+ ```markdown
110
+ ## ITEM 1. Business
111
+
112
+ Apple Inc. designs, manufactures, and markets smartphones, personal computers,
113
+ tablets, wearables, and accessories worldwide...
114
+
115
+ ### Products
116
+
117
+ | Product Category | Revenue (millions) |
118
+ |------------------|-------------------|
119
+ | iPhone | $200,583 |
120
+ | Mac | $29,357 |
121
+ | iPad | $28,300 |
122
+ ...
123
+ ```
124
+
125
+ ## Core Features
126
+
127
+ ### 1️⃣ Section Extraction
128
+ Extract specific sections from 10-K/10-Q filings with type-safe enums:
129
+
130
+ ```python
131
+ from sec2md import Item10K
132
+
133
+ pages = sec2md.convert_to_markdown(html, return_pages=True)
134
+ sections = sec2md.extract_sections(pages, filing_type="10-K")
135
+
136
+ # Get Risk Factors section
137
+ risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
138
+ print(risk.markdown()) # Just the risk factors text
139
+ print(risk.page_range) # (12, 28) - page citations
140
+ ```
141
+
142
+ ### 2️⃣ Page-Aware Chunking
143
+ Intelligent chunking that preserves page numbers for citations:
144
+
145
+ ```python
146
+ chunks = sec2md.chunk_pages(pages, chunk_size=512)
147
+
148
+ for chunk in chunks:
149
+ print(f"Page {chunk.page}: {chunk.content[:100]}...")
150
+ # Use for embeddings, citations, or retrieval
151
+ ```
152
+
153
+ ### 3️⃣ RAG-Optimized Headers
154
+ Boost retrieval quality by adding metadata to chunk embeddings:
155
+
156
+ ```python
157
+ header = """# Apple Inc. (AAPL)
158
+ Form 10-K | FY 2024 | Risk Factors"""
159
+
160
+ chunks = sec2md.chunk_section(risk, header=header)
161
+
162
+ # chunk.embedding_text includes header for better embeddings
163
+ # chunk.content contains only the actual filing text
164
+ ```
165
+
166
+ ### 4️⃣ EdgarTools Integration
167
+ Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
168
+
169
+ ```python
170
+ from edgar import Company
171
+ company = Company("AAPL")
172
+ filing = company.get_filings(form="10-K").latest()
173
+
174
+ md = sec2md.convert_to_markdown(filing.html())
175
+ ```
176
+
177
+ ---
178
+
179
+ ## Why Choose sec2md?
180
+
181
+ ### Just Parse It
182
+ Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
183
+ - 🚀 **Fast** - Processes 200-page filings in seconds
184
+ - 🎯 **Accurate** - Purpose-built for SEC document structure
185
+ - 🔧 **Simple** - One function call, zero configuration
186
+
187
+ ### Built for Agentic RAG
188
+ Don't rebuild what we've already solved:
189
+ - ✅ **Page tracking** - Cite sources with exact page numbers
190
+ - ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
191
+ - ✅ **Smart chunking** - Respects table boundaries, preserves context
192
+ - ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
193
+
194
+ ---
195
+
196
+ ## Documentation
197
+
198
+ 📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
199
+
200
+ - [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
201
+ - [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
202
+ - [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
203
+ - [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
204
+ - [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
205
+ - [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
206
+
207
+ ---
208
+
209
+ ## Contributing
210
+
211
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
212
+
213
+ ## License
214
+
215
+ MIT © 2025 Lucas Astorian
sec2md-0.1.4/README.md ADDED
@@ -0,0 +1,180 @@
1
+ # sec2md
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/sec2md.svg)](https://pypi.org/project/sec2md)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
5
+ [![Documentation](https://img.shields.io/badge/docs-readthedocs-blue.svg)](https://sec2md.readthedocs.io)
6
+
7
+ Transform messy SEC filings into clean, structured Markdown.
8
+ **Built for AI. Optimized for retrieval. Ready for production.**
9
+
10
+ ![Before and After Comparison](comparison.png)
11
+ *Apple 10-K cover page: Raw SEC HTML (left) vs. Clean Markdown (right)*
12
+
13
+ ---
14
+
15
+ ## The Problem
16
+
17
+ RAG pipelines fail on SEC filings because **standard parsers destroy document structure.**
18
+
19
+ When you flatten a 200-page 10-K to plain text:
20
+
21
+ - ❌ **Tables break** — Complex financial statements become misaligned text
22
+ - ❌ **Pages are lost** — Can't cite sources or trace answers back
23
+ - ❌ **Sections merge** — Risk Factors and MD&A become indistinguishable
24
+ - ❌ **Formatting is stripped** — Headers, bold text, and lists (LLM reasoning cues) are gone
25
+ - ❌ **Retrieval fails** — Chunks without structure return wrong context
26
+
27
+ Your RAG system is only as good as your data. Garbage in, garbage out.
28
+
29
+ ## The Solution
30
+
31
+ `sec2md` **rebuilds** SEC filings as clean, semantic Markdown designed for AI systems:
32
+
33
+ - ✅ **Preserves structure** - Headers (`#`), paragraphs, lists maintained
34
+ - ✅ **Converts tables** - Complex HTML tables → clean Markdown pipes
35
+ - ✅ **Strips noise** - XBRL tags, inline styles, and boilerplate removed
36
+ - ✅ **Tracks pages** - Original pagination preserved for citation
37
+ - ✅ **Detects sections** - Auto-extract Risk Factors, MD&A, Business sections
38
+ - ✅ **Chunks intelligently** - Page-aware splitting with metadata headers
39
+
40
+ ### What We Support
41
+
42
+ | Document Type | Status | Notes |
43
+ |----------------------------|--------|--------------------------------------|
44
+ | **10-K/Q Filings** | ✅ | Full section extraction (ITEM 1-16) |
45
+ | **Financial Statements** | ✅ | Tables preserved in Markdown |
46
+ | **Notes to Financials** | ✅ | Automatic table unwrapping |
47
+ | **8-K Press Releases** | ✅ | Clean prose extraction |
48
+ | **Proxy Statements (DEF 14A)** | ✅ | Executive compensation, governance |
49
+ | **Exhibits** (Contracts) | ✅ | Merger agreements, material contracts|
50
+
51
+ ---
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ pip install sec2md
57
+ ```
58
+
59
+ ## Quickstart
60
+
61
+ ```python
62
+ import sec2md
63
+
64
+ # Convert any SEC filing to clean Markdown
65
+ md = sec2md.convert_to_markdown(
66
+ "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
67
+ user_agent="Your Name <you@example.com>"
68
+ )
69
+ ```
70
+
71
+ **Input:** Messy SEC HTML with XBRL tags, nested tables, inline styles
72
+ **Output:** Clean, structured Markdown ready for LLMs
73
+
74
+ ```markdown
75
+ ## ITEM 1. Business
76
+
77
+ Apple Inc. designs, manufactures, and markets smartphones, personal computers,
78
+ tablets, wearables, and accessories worldwide...
79
+
80
+ ### Products
81
+
82
+ | Product Category | Revenue (millions) |
83
+ |------------------|-------------------|
84
+ | iPhone | $200,583 |
85
+ | Mac | $29,357 |
86
+ | iPad | $28,300 |
87
+ ...
88
+ ```
89
+
90
+ ## Core Features
91
+
92
+ ### 1️⃣ Section Extraction
93
+ Extract specific sections from 10-K/10-Q filings with type-safe enums:
94
+
95
+ ```python
96
+ from sec2md import Item10K
97
+
98
+ pages = sec2md.convert_to_markdown(html, return_pages=True)
99
+ sections = sec2md.extract_sections(pages, filing_type="10-K")
100
+
101
+ # Get Risk Factors section
102
+ risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
103
+ print(risk.markdown()) # Just the risk factors text
104
+ print(risk.page_range) # (12, 28) - page citations
105
+ ```
106
+
107
+ ### 2️⃣ Page-Aware Chunking
108
+ Intelligent chunking that preserves page numbers for citations:
109
+
110
+ ```python
111
+ chunks = sec2md.chunk_pages(pages, chunk_size=512)
112
+
113
+ for chunk in chunks:
114
+ print(f"Page {chunk.page}: {chunk.content[:100]}...")
115
+ # Use for embeddings, citations, or retrieval
116
+ ```
117
+
118
+ ### 3️⃣ RAG-Optimized Headers
119
+ Boost retrieval quality by adding metadata to chunk embeddings:
120
+
121
+ ```python
122
+ header = """# Apple Inc. (AAPL)
123
+ Form 10-K | FY 2024 | Risk Factors"""
124
+
125
+ chunks = sec2md.chunk_section(risk, header=header)
126
+
127
+ # chunk.embedding_text includes header for better embeddings
128
+ # chunk.content contains only the actual filing text
129
+ ```
130
+
131
+ ### 4️⃣ EdgarTools Integration
132
+ Works seamlessly with [edgartools](https://github.com/dgunning/edgartools):
133
+
134
+ ```python
135
+ from edgar import Company
136
+ company = Company("AAPL")
137
+ filing = company.get_filings(form="10-K").latest()
138
+
139
+ md = sec2md.convert_to_markdown(filing.html())
140
+ ```
141
+
142
+ ---
143
+
144
+ ## Why Choose sec2md?
145
+
146
+ ### Just Parse It
147
+ Most libraries force you to choose between speed and accuracy. `sec2md` gives you both:
148
+ - 🚀 **Fast** - Processes 200-page filings in seconds
149
+ - 🎯 **Accurate** - Purpose-built for SEC document structure
150
+ - 🔧 **Simple** - One function call, zero configuration
151
+
152
+ ### Built for Agentic RAG
153
+ Don't rebuild what we've already solved:
154
+ - ✅ **Page tracking** - Cite sources with exact page numbers
155
+ - ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
156
+ - ✅ **Smart chunking** - Respects table boundaries, preserves context
157
+ - ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
158
+
159
+ ---
160
+
161
+ ## Documentation
162
+
163
+ 📚 **Full documentation:** [sec2md.readthedocs.io](https://sec2md.readthedocs.io)
164
+
165
+ - [Quickstart Guide](https://sec2md.readthedocs.io/quickstart) - Get up and running in 3 minutes
166
+ - [Convert Filings](https://sec2md.readthedocs.io/usage/direct-conversion) - Handle 10-Ks, exhibits, press releases
167
+ - [Extract Sections](https://sec2md.readthedocs.io/usage/sections) - Pull specific ITEM sections
168
+ - [Chunking for RAG](https://sec2md.readthedocs.io/usage/chunking) - Page-aware chunking with contextual headers
169
+ - [EdgarTools Integration](https://sec2md.readthedocs.io/usage/edgartools) - Automate filing downloads
170
+ - [API Reference](https://sec2md.readthedocs.io/api/convert_to_markdown) - Complete API docs
171
+
172
+ ---
173
+
174
+ ## Contributing
175
+
176
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
177
+
178
+ ## License
179
+
180
+ MIT © 2025 Lucas Astorian
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sec2md"
7
+ version = "0.1.4"
8
+ description = "Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Lucas Astorian", email = "lucas@intellifin.ai"}
14
+ ]
15
+ keywords = ["sec", "edgar", "markdown", "filings", "10-k", "10-q", "llm", "rag", "ai", "embeddings"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Financial and Insurance Industry",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Office/Business :: Financial",
27
+ "Topic :: Text Processing :: Markup :: Markdown",
28
+ ]
29
+
30
+ dependencies = [
31
+ "beautifulsoup4>=4.12.0",
32
+ "lxml>=4.9.0",
33
+ "requests>=2.31.0",
34
+ "tiktoken>=0.5.0", # Default: exact token counting (installing with pip's --no-deps skips this, along with every other dependency)
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ dev = [
39
+ "pytest>=7.0.0",
40
+ "pytest-cov>=4.0.0",
41
+ "black>=23.0.0",
42
+ "ruff>=0.1.0",
43
+ ]
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/lucasastorian/sec2md"
47
+ Repository = "https://github.com/lucasastorian/sec2md"
48
+ Issues = "https://github.com/lucasastorian/sec2md/issues"
49
+
50
+
51
+ [tool.setuptools.packages.find]
52
+ where = ["src"]
53
+
54
+ [tool.black]
55
+ line-length = 100
56
+ target-version = ["py39"]
57
+
58
+ [tool.ruff]
59
+ line-length = 100
60
+ target-version = "py39"
61
+
62
+ [tool.pytest.ini_options]
63
+ testpaths = ["tests"]
64
+ python_files = ["test_*.py"]
sec2md-0.1.4/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,34 @@
1
+ """sec2md: Convert SEC filings to high-quality Markdown."""
2
+
3
+ from sec2md.core import convert_to_markdown
4
+ from sec2md.utils import flatten_note
5
+ from sec2md.sections import extract_sections, get_section
6
+ from sec2md.chunking import chunk_pages, chunk_section, merge_text_blocks, chunk_text_block
7
+ from sec2md.models import Page, Section, Item10K, Item10Q, FilingType, Element, TextBlock
8
+ from sec2md.chunker.markdown_chunk import MarkdownChunk
9
+ from sec2md.chunker.markdown_chunker import MarkdownChunker
10
+ from sec2md.parser import Parser
11
+ from sec2md.section_extractor import SectionExtractor
12
+
13
+ __version__ = "0.1.4"
14
+ __all__ = [
15
+ "convert_to_markdown",
16
+ "flatten_note",
17
+ "extract_sections",
18
+ "get_section",
19
+ "chunk_pages",
20
+ "chunk_section",
21
+ "merge_text_blocks",
22
+ "chunk_text_block",
23
+ "Page",
24
+ "Section",
25
+ "Element",
26
+ "TextBlock",
27
+ "Item10K",
28
+ "Item10Q",
29
+ "FilingType",
30
+ "MarkdownChunk",
31
+ "MarkdownChunker",
32
+ "Parser",
33
+ "SectionExtractor",
34
+ ]