sec2md 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sec2md/chunker/markdown_blocks.py +15 -4
- {sec2md-0.1.0.dist-info → sec2md-0.1.1.dist-info}/METADATA +19 -21
- {sec2md-0.1.0.dist-info → sec2md-0.1.1.dist-info}/RECORD +6 -6
- {sec2md-0.1.0.dist-info → sec2md-0.1.1.dist-info}/WHEEL +0 -0
- {sec2md-0.1.0.dist-info → sec2md-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {sec2md-0.1.0.dist-info → sec2md-0.1.1.dist-info}/top_level.txt +0 -0
|
@@ -2,15 +2,26 @@ import re
|
|
|
2
2
|
from abc import ABC
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
+
try:
|
|
6
|
+
import tiktoken
|
|
7
|
+
TIKTOKEN_AVAILABLE = True
|
|
8
|
+
except ImportError:
|
|
9
|
+
TIKTOKEN_AVAILABLE = False
|
|
10
|
+
|
|
5
11
|
|
|
6
12
|
def estimate_tokens(text: str) -> int:
|
|
7
13
|
"""
|
|
8
|
-
|
|
14
|
+
Calculate token count for text.
|
|
9
15
|
|
|
10
|
-
|
|
11
|
-
|
|
16
|
+
Uses tiktoken with cl100k_base encoding (gpt-3.5-turbo/gpt-4) if available.
|
|
17
|
+
Falls back to character/4 heuristic if tiktoken is not installed.
|
|
12
18
|
"""
|
|
13
|
-
|
|
19
|
+
if TIKTOKEN_AVAILABLE:
|
|
20
|
+
encoding = tiktoken.get_encoding("cl100k_base")
|
|
21
|
+
return len(encoding.encode(text))
|
|
22
|
+
else:
|
|
23
|
+
# Fallback: simple heuristic
|
|
24
|
+
return max(1, len(text) // 4)
|
|
14
25
|
|
|
15
26
|
|
|
16
27
|
def split_sentences(text: str) -> List[str]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sec2md
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
|
|
5
5
|
Author-email: Lucas Astorian <lucas@intellifin.ai>
|
|
6
6
|
License: MIT
|
|
@@ -25,6 +25,7 @@ License-File: LICENSE
|
|
|
25
25
|
Requires-Dist: beautifulsoup4>=4.12.0
|
|
26
26
|
Requires-Dist: lxml>=4.9.0
|
|
27
27
|
Requires-Dist: requests>=2.31.0
|
|
28
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
30
31
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
@@ -34,25 +35,31 @@ Dynamic: license-file
|
|
|
34
35
|
|
|
35
36
|
# sec2md
|
|
36
37
|
|
|
37
|
-
Transform messy SEC filings into clean, structured Markdown.
|
|
38
|
-
**Built for AI. Optimized for retrieval. Ready for production.**
|
|
39
|
-
|
|
40
38
|
[](https://pypi.org/project/sec2md)
|
|
41
|
-
[](https://pepy.tech/project/sec2md)
|
|
42
39
|
[](LICENSE)
|
|
43
40
|
[](https://sec2md.readthedocs.io)
|
|
44
41
|
|
|
42
|
+
Transform messy SEC filings into clean, structured Markdown.
|
|
43
|
+
**Built for AI. Optimized for retrieval. Ready for production.**
|
|
44
|
+
|
|
45
|
+

|
|
46
|
+
*Apple 10-K cover page: Raw SEC HTML (left) vs. Clean Markdown (right)*
|
|
47
|
+
|
|
45
48
|
---
|
|
46
49
|
|
|
47
50
|
## The Problem
|
|
48
51
|
|
|
49
|
-
SEC filings
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
- ❌ **
|
|
52
|
+
RAG pipelines fail on SEC filings because **standard parsers destroy document structure.**
|
|
53
|
+
|
|
54
|
+
When you flatten a 200-page 10-K to plain text:
|
|
55
|
+
|
|
56
|
+
- ❌ **Tables break** — Complex financial statements become misaligned text
|
|
57
|
+
- ❌ **Pages are lost** — Can't cite sources or trace answers back
|
|
58
|
+
- ❌ **Sections merge** — Risk Factors and MD&A become indistinguishable
|
|
59
|
+
- ❌ **Formatting is stripped** — Headers, bolds, lists (LLM reasoning cues) gone
|
|
60
|
+
- ❌ **Retrieval fails** — Chunks without structure return wrong context
|
|
54
61
|
|
|
55
|
-
|
|
62
|
+
Your RAG system is only as good as your data. Garbage in, garbage out.
|
|
56
63
|
|
|
57
64
|
## The Solution
|
|
58
65
|
|
|
@@ -177,22 +184,13 @@ Most libraries force you to choose between speed and accuracy. `sec2md` gives yo
|
|
|
177
184
|
- 🎯 **Accurate** - Purpose-built for SEC document structure
|
|
178
185
|
- 🔧 **Simple** - One function call, zero configuration
|
|
179
186
|
|
|
180
|
-
### Built for
|
|
187
|
+
### Built for Agentic RAG
|
|
181
188
|
Don't rebuild what we've already solved:
|
|
182
189
|
- ✅ **Page tracking** - Cite sources with exact page numbers
|
|
183
190
|
- ✅ **Section detection** - Extract just what you need (Risk Factors, MD&A)
|
|
184
191
|
- ✅ **Smart chunking** - Respects table boundaries, preserves context
|
|
185
192
|
- ✅ **Metadata headers** - Boost embedding quality 2-3x with contextual headers
|
|
186
193
|
|
|
187
|
-
### Avoid the Maintenance Nightmare
|
|
188
|
-
Building your own SEC parser starts simple - scaling it is another story. What begins as BeautifulSoup and regex quickly turns into:
|
|
189
|
-
- 🔴 Edge cases for every filing format variation
|
|
190
|
-
- 🔴 Table parsing that breaks on nested structures
|
|
191
|
-
- 🔴 XBRL tag stripping that misses new namespaces
|
|
192
|
-
- 🔴 Section detection that fails on formatting changes
|
|
193
|
-
|
|
194
|
-
**`sec2md` handles this for you.** Focus on building AI features, not parsing documents.
|
|
195
|
-
|
|
196
194
|
---
|
|
197
195
|
|
|
198
196
|
## Documentation
|
|
@@ -9,11 +9,11 @@ sec2md/sections.py,sha256=TxmpucUH389T9QDqu_SYVsJi0_WtsNTaG0ILspoF88E,3395
|
|
|
9
9
|
sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
|
|
10
10
|
sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
|
|
11
11
|
sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
sec2md/chunker/markdown_blocks.py,sha256=
|
|
12
|
+
sec2md/chunker/markdown_blocks.py,sha256=vfLs3sZp6lVcSY2ZDm_MIuQ41cSxlnclBsEWfxgoGBw,3787
|
|
13
13
|
sec2md/chunker/markdown_chunk.py,sha256=87s_zdZ5prQ4GyeULhdCijJYz8xXn4gSGDRNpotziFU,2632
|
|
14
14
|
sec2md/chunker/markdown_chunker.py,sha256=ex8Kpzae9b8hluLt-CnW_RM6aQ5Ag7u8cy9gWHM8yhU,9305
|
|
15
|
-
sec2md-0.1.
|
|
16
|
-
sec2md-0.1.
|
|
17
|
-
sec2md-0.1.
|
|
18
|
-
sec2md-0.1.
|
|
19
|
-
sec2md-0.1.0.dist-info/RECORD,,
|
|
15
|
+
sec2md-0.1.1.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
|
|
16
|
+
sec2md-0.1.1.dist-info/METADATA,sha256=QcLiN6MRfjo_VQyyruCwn15RVnrsJqUxW_AgViPjNJ4,7594
|
|
17
|
+
sec2md-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
18
|
+
sec2md-0.1.1.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
|
|
19
|
+
sec2md-0.1.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|