stanford-edgar-parser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stanford_edgar_parser-0.1.0/LICENSE +21 -0
- stanford_edgar_parser-0.1.0/MANIFEST.in +2 -0
- stanford_edgar_parser-0.1.0/PKG-INFO +135 -0
- stanford_edgar_parser-0.1.0/README.md +76 -0
- stanford_edgar_parser-0.1.0/pyproject.toml +56 -0
- stanford_edgar_parser-0.1.0/setup.cfg +4 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/README.md +102 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/__init__.py +39 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/__main__.py +80 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/_fragments.py +20 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/_state.py +7 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/claude/stanford-edgar-parser/SKILL.md +38 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/SKILL.md +50 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/agents/openai.yaml +4 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/references/review.md +34 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/ai.py +123 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/api.py +48 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/config.py +41 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/hardcodes.py +87 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/mcp_server.py +307 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/multimarkdown/__init__.py +8 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/multimarkdown/multimarkdown.py +468 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/orchestrator.py +858 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/__init__.py +0 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/__init__.py +16 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/html.py +915 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/postprocessing.py +755 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/preprocessing.py +1788 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/table_cleaning.py +2120 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/__init__.py +21 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/mistral_keys.py +559 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/ocr.py +150 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/ocr_utils.py +864 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/plaintext/__init__.py +12 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/plaintext/legacy_form_parsers.py +152 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/plaintext/plaintext_parser.py +89 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/sgml/__init__.py +12 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/sgml/sgml_utils.py +274 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/__init__.py +35 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/fund_and_ownership.py +2788 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/ownership.py +312 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/regulatory_forms.py +4079 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/runtime.py +41 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/sec_parser.py +24 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/special_chars.py +158 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/__init__.py +12 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/bootstrap.py +106 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/parse_stats.py +289 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/tokenizer.py +137 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/PKG-INFO +135 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/SOURCES.txt +53 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/dependency_links.txt +1 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/entry_points.txt +4 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/requires.txt +17 -0
- stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Stanford Advanced FinTech Lab (SAFTL)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stanford-edgar-parser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Layout-faithful SEC EDGAR filing parser from the Stanford EDGAR Filings Dataset.
|
|
5
|
+
Author: Stanford Advanced Financial Technologies Lab
|
|
6
|
+
Project-URL: Homepage, https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset
|
|
7
|
+
Project-URL: Repository, https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset
|
|
8
|
+
Keywords: sec,edgar,filings,multimarkdown,financial-documents
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
17
|
+
Requires-Dist: imgkit>=1.2.3
|
|
18
|
+
Requires-Dist: lxml>=6.0.0
|
|
19
|
+
Requires-Dist: mistralai>=1.9.0
|
|
20
|
+
Requires-Dist: numpy>=2.0.0
|
|
21
|
+
Requires-Dist: pandas>=2.0.0
|
|
22
|
+
Requires-Dist: playwright>=1.48.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: PyMuPDF>=1.24.0
|
|
25
|
+
Requires-Dist: PyPDF2>=3.0.1
|
|
26
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
27
|
+
Requires-Dist: requests>=2.32.0
|
|
28
|
+
Requires-Dist: tabulate>=0.9.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# Stanford EDGAR Parser
|
|
35
|
+
|
|
36
|
+
Layout-faithful SEC filing parser used by the Stanford EDGAR Filings Dataset.
|
|
37
|
+
It converts raw EDGAR TXT/HTML/SGML/XML submissions into Markdown or
|
|
38
|
+
MultiMarkdown while preserving financial-table structure, indentation, links,
|
|
39
|
+
inline formatting, and filing metadata where possible.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
From PyPI, after release:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install stanford-edgar-parser
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Until then, install directly from GitHub:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
For local development from a clone:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Layout
|
|
62
|
+
|
|
63
|
+
- `runtime.py`: backward-compatible re-export shim
|
|
64
|
+
- `orchestrator.py`: local filing orchestration and final output cleanup
|
|
65
|
+
- `utils/`: imports, tokenizer helpers, parse statistics, and shared setup
|
|
66
|
+
- `multimarkdown/`: MultiMarkdown table conversion
|
|
67
|
+
- `parsers/html/`: HTML preprocessing, table cleanup, parser, and postprocessing
|
|
68
|
+
- `parsers/ocr/`: Mistral OCR key rotation, PDF/image OCR, and OCR utilities
|
|
69
|
+
- `parsers/plaintext/`: plaintext and legacy text-form parsers
|
|
70
|
+
- `parsers/sgml/`: SGML document-block utilities
|
|
71
|
+
- `parsers/xml/`: XML filing-form parsers
|
|
72
|
+
- `sec_parser.py`: compatibility shim for old `python stanford_edgar_parser/sec_parser.py` usage
|
|
73
|
+
- `__main__.py`: `python -m stanford_edgar_parser` command-line entrypoint
|
|
74
|
+
|
|
75
|
+
The original implementation remains untouched at `sec_parser/sec_parser.py`.
|
|
76
|
+
The equivalence tests in `tests/parser_equivalence/` verify the split-module
|
|
77
|
+
coverage and compare parser outputs bit-for-bit.
|
|
78
|
+
|
|
79
|
+
## Usage
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
python -m stanford_edgar_parser path/to/filing.txt
|
|
83
|
+
python -m stanford_edgar_parser path/to/filing.txt --to_mmd
|
|
84
|
+
stanford-edgar-parser path/to/filing.txt --to_mmd
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from stanford_edgar_parser import main_one, parse_html_filing
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Agent Skill Install
|
|
92
|
+
|
|
93
|
+
Install bundled Codex and Claude skill files:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
stanford-edgar-install-skill
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Or from Python:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from stanford_edgar_parser.ai import install_skill
|
|
103
|
+
|
|
104
|
+
install_skill()
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Use `--overwrite` if you want to replace an existing installed skill.
|
|
108
|
+
|
|
109
|
+
## MCP
|
|
110
|
+
|
|
111
|
+
After package install, expose the parser as an MCP server with:
|
|
112
|
+
|
|
113
|
+
```toml
|
|
114
|
+
[mcp_servers.stanford_edgar_parser]
|
|
115
|
+
command = "uvx"
|
|
116
|
+
args = ["--from", "stanford-edgar-parser", "stanford-edgar-mcp"]
|
|
117
|
+
startup_timeout_sec = 120
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Before the PyPI release, use the GitHub package source:
|
|
121
|
+
|
|
122
|
+
```toml
|
|
123
|
+
[mcp_servers.stanford_edgar_parser]
|
|
124
|
+
command = "uvx"
|
|
125
|
+
args = [
|
|
126
|
+
"--from",
|
|
127
|
+
"stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git",
|
|
128
|
+
"stanford-edgar-mcp"
|
|
129
|
+
]
|
|
130
|
+
startup_timeout_sec = 120
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
The package-installed MCP server always exposes `parse_filing`. Repo-local
|
|
134
|
+
rendering and review tools are exposed when the full clone includes
|
|
135
|
+
`multimarkdown.js`, `html-to-pdf.mjs`, and `tools/`.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Stanford EDGAR Parser
|
|
2
|
+
|
|
3
|
+
Layout-faithful SEC EDGAR filing parser from the Stanford EDGAR Filings Dataset.
|
|
4
|
+
It converts raw EDGAR TXT/HTML/SGML/XML submissions into Markdown or
|
|
5
|
+
MultiMarkdown while preserving financial-table structure, indentation, links,
|
|
6
|
+
inline formatting, and filing metadata where possible.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
After PyPI release:
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install stanford-edgar-parser
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Until then, install directly from GitHub:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Parse
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
stanford-edgar-parser path/to/filing.txt --to_mmd
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
or:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
python -m stanford_edgar_parser path/to/filing.txt --to_mmd
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Agent Skills
|
|
35
|
+
|
|
36
|
+
Install bundled Codex and Claude skills:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
stanford-edgar-install-skill
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
or:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from stanford_edgar_parser.ai import install_skill
|
|
46
|
+
|
|
47
|
+
install_skill()
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## MCP
|
|
51
|
+
|
|
52
|
+
Package install:
|
|
53
|
+
|
|
54
|
+
```toml
|
|
55
|
+
[mcp_servers.stanford_edgar_parser]
|
|
56
|
+
command = "uvx"
|
|
57
|
+
args = ["--from", "stanford-edgar-parser", "stanford-edgar-mcp"]
|
|
58
|
+
startup_timeout_sec = 120
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
GitHub install before PyPI release:
|
|
62
|
+
|
|
63
|
+
```toml
|
|
64
|
+
[mcp_servers.stanford_edgar_parser]
|
|
65
|
+
command = "uvx"
|
|
66
|
+
args = [
|
|
67
|
+
"--from",
|
|
68
|
+
"stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git",
|
|
69
|
+
"stanford-edgar-mcp"
|
|
70
|
+
]
|
|
71
|
+
startup_timeout_sec = 120
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
The package-installed MCP server always exposes `parse_filing`. Repo-local
|
|
75
|
+
rendering and review tools are exposed when the full clone includes
|
|
76
|
+
`multimarkdown.js`, `html-to-pdf.mjs`, and `tools/`.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "stanford-edgar-parser"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Layout-faithful SEC EDGAR filing parser from the Stanford EDGAR Filings Dataset."
|
|
9
|
+
readme = "stanford_edgar_parser/README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Stanford Advanced Financial Technologies Lab" }
|
|
13
|
+
]
|
|
14
|
+
keywords = ["sec", "edgar", "filings", "multimarkdown", "financial-documents"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Topic :: Text Processing :: Markup",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"beautifulsoup4>=4.12.3",
|
|
23
|
+
"imgkit>=1.2.3",
|
|
24
|
+
"lxml>=6.0.0",
|
|
25
|
+
"mistralai>=1.9.0",
|
|
26
|
+
"numpy>=2.0.0",
|
|
27
|
+
"pandas>=2.0.0",
|
|
28
|
+
"playwright>=1.48.0",
|
|
29
|
+
"pydantic>=2.0.0",
|
|
30
|
+
"PyMuPDF>=1.24.0",
|
|
31
|
+
"PyPDF2>=3.0.1",
|
|
32
|
+
"python-dotenv>=1.0.0",
|
|
33
|
+
"requests>=2.32.0",
|
|
34
|
+
"tabulate>=0.9.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
dev = [
|
|
39
|
+
"build>=1.2.0",
|
|
40
|
+
"pytest>=8.0.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset"
|
|
45
|
+
Repository = "https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset"
|
|
46
|
+
|
|
47
|
+
[project.scripts]
|
|
48
|
+
stanford-edgar-parser = "stanford_edgar_parser.__main__:main"
|
|
49
|
+
stanford-edgar-mcp = "stanford_edgar_parser.mcp_server:main"
|
|
50
|
+
stanford-edgar-install-skill = "stanford_edgar_parser.ai:main"
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.packages.find]
|
|
53
|
+
include = ["stanford_edgar_parser*"]
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.package-data]
|
|
56
|
+
stanford_edgar_parser = ["agent_assets/**/*", "README.md"]
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Stanford EDGAR Parser
|
|
2
|
+
|
|
3
|
+
Layout-faithful SEC filing parser used by the Stanford EDGAR Filings Dataset.
|
|
4
|
+
It converts raw EDGAR TXT/HTML/SGML/XML submissions into Markdown or
|
|
5
|
+
MultiMarkdown while preserving financial-table structure, indentation, links,
|
|
6
|
+
inline formatting, and filing metadata where possible.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
From PyPI, after release:
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install stanford-edgar-parser
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Until then, install directly from GitHub:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
For local development from a clone:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Layout
|
|
29
|
+
|
|
30
|
+
- `runtime.py`: backward-compatible re-export shim
|
|
31
|
+
- `orchestrator.py`: local filing orchestration and final output cleanup
|
|
32
|
+
- `utils/`: imports, tokenizer helpers, parse statistics, and shared setup
|
|
33
|
+
- `multimarkdown/`: MultiMarkdown table conversion
|
|
34
|
+
- `parsers/html/`: HTML preprocessing, table cleanup, parser, and postprocessing
|
|
35
|
+
- `parsers/ocr/`: Mistral OCR key rotation, PDF/image OCR, and OCR utilities
|
|
36
|
+
- `parsers/plaintext/`: plaintext and legacy text-form parsers
|
|
37
|
+
- `parsers/sgml/`: SGML document-block utilities
|
|
38
|
+
- `parsers/xml/`: XML filing-form parsers
|
|
39
|
+
- `sec_parser.py`: compatibility shim for old `python stanford_edgar_parser/sec_parser.py` usage
|
|
40
|
+
- `__main__.py`: `python -m stanford_edgar_parser` command-line entrypoint
|
|
41
|
+
|
|
42
|
+
The original implementation remains untouched at `sec_parser/sec_parser.py` in the source repository.
|
|
43
|
+
The equivalence tests in `tests/parser_equivalence/` verify the split-module
|
|
44
|
+
coverage and compare parser outputs bit-for-bit.
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
python -m stanford_edgar_parser path/to/filing.txt
|
|
50
|
+
python -m stanford_edgar_parser path/to/filing.txt --to_mmd
|
|
51
|
+
stanford-edgar-parser path/to/filing.txt --to_mmd
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from stanford_edgar_parser import main_one, parse_html_filing
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Agent Skill Install
|
|
59
|
+
|
|
60
|
+
Install bundled Codex and Claude skill files:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
stanford-edgar-install-skill
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Or from Python:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from stanford_edgar_parser.ai import install_skill
|
|
70
|
+
|
|
71
|
+
install_skill()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Use `--overwrite` if you want to replace an existing installed skill.
|
|
75
|
+
|
|
76
|
+
## MCP
|
|
77
|
+
|
|
78
|
+
After package install, expose the parser as an MCP server with:
|
|
79
|
+
|
|
80
|
+
```toml
|
|
81
|
+
[mcp_servers.stanford_edgar_parser]
|
|
82
|
+
command = "uvx"
|
|
83
|
+
args = ["--from", "stanford-edgar-parser", "stanford-edgar-mcp"]
|
|
84
|
+
startup_timeout_sec = 120
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Before the PyPI release, use the GitHub package source:
|
|
88
|
+
|
|
89
|
+
```toml
|
|
90
|
+
[mcp_servers.stanford_edgar_parser]
|
|
91
|
+
command = "uvx"
|
|
92
|
+
args = [
|
|
93
|
+
"--from",
|
|
94
|
+
"stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git",
|
|
95
|
+
"stanford-edgar-mcp"
|
|
96
|
+
]
|
|
97
|
+
startup_timeout_sec = 120
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
The package-installed MCP server always exposes `parse_filing`. Repo-local
|
|
101
|
+
rendering and review tools are exposed when the full clone includes
|
|
102
|
+
`multimarkdown.js`, `html-to-pdf.mjs`, and `tools/`.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Stanford EDGAR filings parser package."""
|
|
2
|
+
|
|
3
|
+
from . import _state
|
|
4
|
+
from . import api as _sec_parser
|
|
5
|
+
|
|
6
|
+
clean_financial_df = _sec_parser.clean_financial_df
|
|
7
|
+
convert_all_tables_to_mmd = _sec_parser.convert_all_tables_to_mmd
|
|
8
|
+
df_to_markdown = _sec_parser.df_to_markdown
|
|
9
|
+
df_to_multimarkdown = _sec_parser.df_to_multimarkdown
|
|
10
|
+
estimate_parser_tokens = _sec_parser.estimate_parser_tokens
|
|
11
|
+
main_one = _sec_parser.main_one
|
|
12
|
+
normalize_text_markup = _sec_parser.normalize_text_markup
|
|
13
|
+
parse_any_xml = _sec_parser.parse_any_xml
|
|
14
|
+
parse_html_filing = _sec_parser.parse_html_filing
|
|
15
|
+
parse_pdf_attachments = _sec_parser.parse_pdf_attachments
|
|
16
|
+
parse_plaintext_filing = _sec_parser.parse_plaintext_filing
|
|
17
|
+
process_local_xbrl = _sec_parser.process_local_xbrl
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def __getattr__(name: str):
    """Resolve package attributes that are not bound directly in this module.

    This is the module-level ``__getattr__`` hook (PEP 562); Python calls it
    only after normal attribute lookup on the package has failed.

    Lookup order:
        1. ``_state`` -- checked first and looked up on every access rather
           than copied at import time (presumably so that values such as
           ``LAST_PARSE_STATS``, which is listed in ``__all__`` but never
           assigned here, stay current -- confirm against ``_state.py``).
        2. the ``api`` module (imported here as ``_sec_parser``).

    Args:
        name: Attribute name requested on the package.

    Returns:
        The attribute found on ``_state`` or, failing that, on the ``api``
        module.

    Raises:
        AttributeError: If neither module provides ``name``. The message
            names this package rather than the internal ``api`` module so
            the error matches what the caller actually accessed.
    """
    if hasattr(_state, name):
        return getattr(_state, name)
    try:
        return getattr(_sec_parser, name)
    except AttributeError:
        # Re-raise with the package's own name; the inner message would
        # otherwise point at an implementation module the caller never touched.
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"LAST_PARSE_STATS",
|
|
27
|
+
"clean_financial_df",
|
|
28
|
+
"convert_all_tables_to_mmd",
|
|
29
|
+
"df_to_markdown",
|
|
30
|
+
"df_to_multimarkdown",
|
|
31
|
+
"estimate_parser_tokens",
|
|
32
|
+
"main_one",
|
|
33
|
+
"normalize_text_markup",
|
|
34
|
+
"parse_any_xml",
|
|
35
|
+
"parse_html_filing",
|
|
36
|
+
"parse_pdf_attachments",
|
|
37
|
+
"parse_plaintext_filing",
|
|
38
|
+
"process_local_xbrl",
|
|
39
|
+
]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Command-line entrypoint for ``python -m stanford_edgar_parser``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import pathlib
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
from .api import (
|
|
11
|
+
get_mistral_key_status_snapshot,
|
|
12
|
+
main_one,
|
|
13
|
+
reset_mistral_key_status,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main(argv: list[str] | None = None) -> int:
|
|
18
|
+
parser = argparse.ArgumentParser(
|
|
19
|
+
description="Parse an SEC filing in HTML, HTM, or TXT format and convert it to Markdown.",
|
|
20
|
+
formatter_class=argparse.RawTextHelpFormatter,
|
|
21
|
+
)
|
|
22
|
+
parser.add_argument("path", nargs="?", help="Path to the SEC filing.")
|
|
23
|
+
parser.add_argument(
|
|
24
|
+
"--to_mmd",
|
|
25
|
+
action="store_true",
|
|
26
|
+
help="Convert all tables in the final output to MultiMarkdown format.",
|
|
27
|
+
)
|
|
28
|
+
parser.add_argument(
|
|
29
|
+
"--source-document-url",
|
|
30
|
+
help="Optional absolute source document URL used to resolve normal relative links.",
|
|
31
|
+
)
|
|
32
|
+
parser.add_argument(
|
|
33
|
+
"--disable_indentation",
|
|
34
|
+
"--disable-indentation",
|
|
35
|
+
action="store_true",
|
|
36
|
+
dest="disable_indentation",
|
|
37
|
+
help="Remove final-output indentation NBSP markers from the written Markdown.",
|
|
38
|
+
)
|
|
39
|
+
parser.add_argument(
|
|
40
|
+
"--mistral-key-status",
|
|
41
|
+
action="store_true",
|
|
42
|
+
help="Print the shared Mistral key rotation/usage monitor JSON and exit.",
|
|
43
|
+
)
|
|
44
|
+
parser.add_argument(
|
|
45
|
+
"--reset-mistral-key-status",
|
|
46
|
+
action="store_true",
|
|
47
|
+
help="Reset the shared Mistral key rotation/usage monitor JSON and exit.",
|
|
48
|
+
)
|
|
49
|
+
args = parser.parse_args(argv)
|
|
50
|
+
|
|
51
|
+
if args.reset_mistral_key_status:
|
|
52
|
+
print(json.dumps(reset_mistral_key_status(), indent=2, sort_keys=True))
|
|
53
|
+
return 0
|
|
54
|
+
if args.mistral_key_status:
|
|
55
|
+
print(json.dumps(get_mistral_key_status_snapshot(), indent=2, sort_keys=True))
|
|
56
|
+
return 0
|
|
57
|
+
if not args.path:
|
|
58
|
+
parser.error("the following arguments are required: path")
|
|
59
|
+
|
|
60
|
+
file_path = pathlib.Path(args.path)
|
|
61
|
+
if not file_path.is_file() and file_path.parts and file_path.parts[0] == "sec_parser":
|
|
62
|
+
alt_path = pathlib.Path(*file_path.parts[1:])
|
|
63
|
+
if alt_path.is_file():
|
|
64
|
+
print(f"[info] Using '{alt_path}' instead of '{args.path}'.")
|
|
65
|
+
file_path = alt_path
|
|
66
|
+
if not file_path.is_file():
|
|
67
|
+
print(f"Error: File not found at {args.path}", file=sys.stderr)
|
|
68
|
+
return 1
|
|
69
|
+
|
|
70
|
+
main_one(
|
|
71
|
+
file_path,
|
|
72
|
+
to_mmd=args.to_mmd,
|
|
73
|
+
source_document_url=args.source_document_url,
|
|
74
|
+
disable_indentation=args.disable_indentation,
|
|
75
|
+
)
|
|
76
|
+
return 0
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
if __name__ == "__main__":
|
|
80
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
FRAGMENTS = [
|
|
2
|
+
('utils/bootstrap.py', 1, 113),
|
|
3
|
+
('utils/tokenizer.py', 114, 232),
|
|
4
|
+
('utils/parse_stats.py', 233, 497),
|
|
5
|
+
('parsers/ocr/mistral_keys.py', 498, 1033),
|
|
6
|
+
('parsers/ocr/ocr_utils.py', 1034, 1815),
|
|
7
|
+
('multimarkdown/multimarkdown.py', 1816, 2128),
|
|
8
|
+
('parsers/html/table_cleaning.py', 2129, 4211),
|
|
9
|
+
('parsers/xml/ownership.py', 4212, 4509),
|
|
10
|
+
('parsers/plaintext/plaintext_parser.py', 4510, 4592),
|
|
11
|
+
('parsers/xml/fund_and_ownership.py', 4593, 7342),
|
|
12
|
+
('parsers/xml/regulatory_forms.py', 7343, 11388),
|
|
13
|
+
('parsers/html/preprocessing.py', 11389, 13085),
|
|
14
|
+
('parsers/html/html.py', 13086, 13850),
|
|
15
|
+
('parsers/html/postprocessing.py', 13851, 14531),
|
|
16
|
+
('parsers/plaintext/legacy_form_parsers.py', 14532, 14675),
|
|
17
|
+
('parsers/ocr/ocr.py', 14676, 14794),
|
|
18
|
+
('parsers/sgml/sgml_utils.py', 14795, 15053),
|
|
19
|
+
('orchestrator.py', 15054, 15460),
|
|
20
|
+
]
|
stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/claude/stanford-edgar-parser/SKILL.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stanford-edgar-parser
|
|
3
|
+
description: Use the Stanford EDGAR Filings Dataset parser to parse SEC filings into layout-faithful MultiMarkdown, render and inspect examples, debug table/indentation/link/OCR issues, run showcase checks, and use the local MCP server.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Stanford EDGAR Parser
|
|
7
|
+
|
|
8
|
+
Use this skill in the Stanford EDGAR parser repository or in an environment
|
|
9
|
+
where the `stanford-edgar-parser` Python package is installed.
|
|
10
|
+
|
|
11
|
+
## Commands
|
|
12
|
+
|
|
13
|
+
- Parse to MultiMarkdown:
|
|
14
|
+
`python -m stanford_edgar_parser path/to/filing.txt --to_mmd`
|
|
15
|
+
or `stanford-edgar-parser path/to/filing.txt --to_mmd`
|
|
16
|
+
- Remove final indentation markers:
|
|
17
|
+
`python -m stanford_edgar_parser path/to/filing.txt --to_mmd --disable-indentation`
|
|
18
|
+
- Render:
|
|
19
|
+
`node multimarkdown.js path/to/parsed.md > /tmp/sefd.html`
|
|
20
|
+
`node html-to-pdf.mjs /tmp/sefd.html path/to/rendered.pdf`
|
|
21
|
+
- Static showcase checks:
|
|
22
|
+
`python tools/check_showcase_tables.py examples`
|
|
23
|
+
- Raw-vs-parsed review:
|
|
24
|
+
`python tools/review_snippet.py <example-dir-or-accession> "<needle text>"`
|
|
25
|
+
- MCP server:
|
|
26
|
+
`python -m stanford_edgar_parser.mcp_server`
|
|
27
|
+
or `stanford-edgar-mcp`
|
|
28
|
+
|
|
29
|
+
The repo includes `.mcp.json` for clients that support project-local MCP configuration.
|
|
30
|
+
Package installs always expose `parse_filing`; repo-only render/review tools are exposed only when helper scripts are present.
|
|
31
|
+
|
|
32
|
+
## Review Standard
|
|
33
|
+
|
|
34
|
+
Compare parser output to the raw browser view. Do not approve output just because rendered Markdown looks plausible.
|
|
35
|
+
|
|
36
|
+
Prioritize table fidelity: visible columns/rows, merged headers, indentation hierarchy, `$`, `%`, `)`, `bp`, accounting parentheses, superscripts, subscripts, same-target links, and image placeholders.
|
|
37
|
+
|
|
38
|
+
Fix parser root causes. Do not introduce phrase-specific showcase hardcodes.
|
stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/SKILL.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stanford-edgar-parser
|
|
3
|
+
description: Parse, render, inspect, and debug SEC EDGAR filings with the Stanford EDGAR Filings Dataset parser. Use when Codex is asked to convert local SEC filing TXT/HTML/SGML/XML/PDF-containing submissions into layout-faithful MultiMarkdown, review parser output against raw browser layout, diagnose table/indentation/link/OCR artifacts, run showcase checks, or use the repo's MCP server.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Stanford EDGAR Parser
|
|
7
|
+
|
|
8
|
+
Use this skill inside a Stanford EDGAR parser repo clone or in an environment
|
|
9
|
+
where the `stanford-edgar-parser` Python package is installed.
|
|
10
|
+
|
|
11
|
+
## Quick Workflow
|
|
12
|
+
|
|
13
|
+
1. Parse local filings with:
|
|
14
|
+
`python -m stanford_edgar_parser path/to/filing.txt --to_mmd`
|
|
15
|
+
or, from an installed package:
|
|
16
|
+
`stanford-edgar-parser path/to/filing.txt --to_mmd`
|
|
17
|
+
2. Use `--disable-indentation` only when the caller wants the final `&nbsp;` (non-breaking space) indentation markers removed.
|
|
18
|
+
3. Render Markdown when visual QA matters:
|
|
19
|
+
`node multimarkdown.js path/to/parsed.md > /tmp/sefd.html`
|
|
20
|
+
`node html-to-pdf.mjs /tmp/sefd.html path/to/rendered.pdf`
|
|
21
|
+
4. Run showcase checks before accepting examples or parser changes:
|
|
22
|
+
`python tools/check_showcase_tables.py examples`
|
|
23
|
+
5. For suspicious output, compare raw HTML and parsed Markdown with:
|
|
24
|
+
`python tools/review_snippet.py <example-dir-or-accession> "<needle text>"`
|
|
25
|
+
|
|
26
|
+
## Debugging Rules
|
|
27
|
+
|
|
28
|
+
- Fix source-level reconstruction logic, not local phrase hardcodes.
|
|
29
|
+
- Preserve filer text if the raw source itself has a typo, missing space, or odd punctuation.
|
|
30
|
+
- Never assume rendered Markdown alone is correct; compare against the raw browser view.
|
|
31
|
+
- Watch especially for detached `$`, `%`, `)`, `bp`, lost negative parentheses, dropped columns, row/col span drift, malformed emphasis, broken same-URL links, and missing indentation in lists/tables.
|
|
32
|
+
- Keep scratch review artifacts out of published `examples/` unless explicitly requested.
|
|
33
|
+
|
|
34
|
+
## MCP
|
|
35
|
+
|
|
36
|
+
Start the local MCP server with:
|
|
37
|
+
`python -m stanford_edgar_parser.mcp_server`
|
|
38
|
+
|
|
39
|
+
or, from an installed package:
|
|
40
|
+
`stanford-edgar-mcp`
|
|
41
|
+
|
|
42
|
+
It exposes parser-oriented tools for parsing filings, rendering Markdown to PDF, running showcase checks, and generating raw-vs-parsed review snippets.
|
|
43
|
+
|
|
44
|
+
Package installs always expose `parse_filing`. Repo-only render/review tools are exposed only when `multimarkdown.js`, `html-to-pdf.mjs`, and `tools/` are present.
|
|
45
|
+
|
|
46
|
+
The repo includes `.mcp.json` for clients that support project-local MCP configuration.
|
|
47
|
+
|
|
48
|
+
## Reference
|
|
49
|
+
|
|
50
|
+
Read `references/review.md` when doing a meticulous parser-output review or preparing showcase examples.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Parser Output Review
|
|
2
|
+
|
|
3
|
+
Use this checklist when reviewing parsed filings or showcase examples.
|
|
4
|
+
|
|
5
|
+
## Compare Against Source
|
|
6
|
+
|
|
7
|
+
- Inspect the raw browser view for the same section, table, or paragraph.
|
|
8
|
+
- If the raw source itself has a typo or missing space, preserve it unless the task is explicit normalization.
|
|
9
|
+
- If the browser visually joins split HTML fragments, the parser should usually reconstruct the same semantic value.
|
|
10
|
+
|
|
11
|
+
## Tables
|
|
12
|
+
|
|
13
|
+
- Check that every visible column and row appears.
|
|
14
|
+
- Check that merged headers remain grouped with MultiMarkdown `||` and `^^`.
|
|
15
|
+
- Check that currency/percent/parentheses modifiers attach to numbers: `$7700`, `75.0%`, `(200)`, `)bp`.
|
|
16
|
+
- Check that empty body rows are not artifacts from rowspan/colspan scaffolding.
|
|
17
|
+
- Check that numeric signs and accounting parentheses are not flipped or dropped.
|
|
18
|
+
- Check that row labels keep indentation levels where they carry hierarchy.
|
|
19
|
+
|
|
20
|
+
## Text And Lists
|
|
21
|
+
|
|
22
|
+
- Check numbered, alphabetic, and parenthesized list markers for a visible space or indentation after the marker.
|
|
23
|
+
- Check that paragraph indentation is preserved where it expresses hierarchy.
|
|
24
|
+
- Check that adjacent styled spans do not create malformed Markdown emphasis.
|
|
25
|
+
- Check that adjacent links with the same target preserve readable spacing.
|
|
26
|
+
- Check image placeholders are explicit and not confused with body text.
|
|
27
|
+
|
|
28
|
+
## Useful Commands
|
|
29
|
+
|
|
30
|
+
- `python tools/check_showcase_tables.py examples`
|
|
31
|
+
- `python tools/review_snippet.py examples/<accession> "<needle text>"`
|
|
32
|
+
- `python -m stanford_edgar_parser examples/<accession>/raw.txt --to_mmd`
|
|
33
|
+
- `node multimarkdown.js examples/<accession>/parsed.md > /tmp/sefd.html`
|
|
34
|
+
- `node html-to-pdf.mjs /tmp/sefd.html /tmp/sefd.pdf`
|