stanford-edgar-parser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. stanford_edgar_parser-0.1.0/LICENSE +21 -0
  2. stanford_edgar_parser-0.1.0/MANIFEST.in +2 -0
  3. stanford_edgar_parser-0.1.0/PKG-INFO +135 -0
  4. stanford_edgar_parser-0.1.0/README.md +76 -0
  5. stanford_edgar_parser-0.1.0/pyproject.toml +56 -0
  6. stanford_edgar_parser-0.1.0/setup.cfg +4 -0
  7. stanford_edgar_parser-0.1.0/stanford_edgar_parser/README.md +102 -0
  8. stanford_edgar_parser-0.1.0/stanford_edgar_parser/__init__.py +39 -0
  9. stanford_edgar_parser-0.1.0/stanford_edgar_parser/__main__.py +80 -0
  10. stanford_edgar_parser-0.1.0/stanford_edgar_parser/_fragments.py +20 -0
  11. stanford_edgar_parser-0.1.0/stanford_edgar_parser/_state.py +7 -0
  12. stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/claude/stanford-edgar-parser/SKILL.md +38 -0
  13. stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/SKILL.md +50 -0
  14. stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/agents/openai.yaml +4 -0
  15. stanford_edgar_parser-0.1.0/stanford_edgar_parser/agent_assets/codex/stanford-edgar-parser/references/review.md +34 -0
  16. stanford_edgar_parser-0.1.0/stanford_edgar_parser/ai.py +123 -0
  17. stanford_edgar_parser-0.1.0/stanford_edgar_parser/api.py +48 -0
  18. stanford_edgar_parser-0.1.0/stanford_edgar_parser/config.py +41 -0
  19. stanford_edgar_parser-0.1.0/stanford_edgar_parser/hardcodes.py +87 -0
  20. stanford_edgar_parser-0.1.0/stanford_edgar_parser/mcp_server.py +307 -0
  21. stanford_edgar_parser-0.1.0/stanford_edgar_parser/multimarkdown/__init__.py +8 -0
  22. stanford_edgar_parser-0.1.0/stanford_edgar_parser/multimarkdown/multimarkdown.py +468 -0
  23. stanford_edgar_parser-0.1.0/stanford_edgar_parser/orchestrator.py +858 -0
  24. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/__init__.py +0 -0
  25. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/__init__.py +16 -0
  26. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/html.py +915 -0
  27. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/postprocessing.py +755 -0
  28. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/preprocessing.py +1788 -0
  29. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/html/table_cleaning.py +2120 -0
  30. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/__init__.py +21 -0
  31. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/mistral_keys.py +559 -0
  32. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/ocr.py +150 -0
  33. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/ocr/ocr_utils.py +864 -0
  34. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/plaintext/__init__.py +12 -0
  35. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/plaintext/legacy_form_parsers.py +152 -0
  36. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/plaintext/plaintext_parser.py +89 -0
  37. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/sgml/__init__.py +12 -0
  38. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/sgml/sgml_utils.py +274 -0
  39. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/__init__.py +35 -0
  40. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/fund_and_ownership.py +2788 -0
  41. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/ownership.py +312 -0
  42. stanford_edgar_parser-0.1.0/stanford_edgar_parser/parsers/xml/regulatory_forms.py +4079 -0
  43. stanford_edgar_parser-0.1.0/stanford_edgar_parser/runtime.py +41 -0
  44. stanford_edgar_parser-0.1.0/stanford_edgar_parser/sec_parser.py +24 -0
  45. stanford_edgar_parser-0.1.0/stanford_edgar_parser/special_chars.py +158 -0
  46. stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/__init__.py +12 -0
  47. stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/bootstrap.py +106 -0
  48. stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/parse_stats.py +289 -0
  49. stanford_edgar_parser-0.1.0/stanford_edgar_parser/utils/tokenizer.py +137 -0
  50. stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/PKG-INFO +135 -0
  51. stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/SOURCES.txt +53 -0
  52. stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/dependency_links.txt +1 -0
  53. stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/entry_points.txt +4 -0
  54. stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/requires.txt +17 -0
  55. stanford_edgar_parser-0.1.0/stanford_edgar_parser.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stanford Advanced FinTech Lab(SAFTL)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ include stanford_edgar_parser/README.md
2
+ recursive-include stanford_edgar_parser/agent_assets *
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: stanford-edgar-parser
3
+ Version: 0.1.0
4
+ Summary: Layout-faithful SEC EDGAR filing parser from the Stanford EDGAR Filings Dataset.
5
+ Author: Stanford Advanced Financial Technologies Lab
6
+ Project-URL: Homepage, https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset
7
+ Project-URL: Repository, https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset
8
+ Keywords: sec,edgar,filings,multimarkdown,financial-documents
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Text Processing :: Markup
13
+ Requires-Python: >=3.11
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: beautifulsoup4>=4.12.3
17
+ Requires-Dist: imgkit>=1.2.3
18
+ Requires-Dist: lxml>=6.0.0
19
+ Requires-Dist: mistralai>=1.9.0
20
+ Requires-Dist: numpy>=2.0.0
21
+ Requires-Dist: pandas>=2.0.0
22
+ Requires-Dist: playwright>=1.48.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: PyMuPDF>=1.24.0
25
+ Requires-Dist: PyPDF2>=3.0.1
26
+ Requires-Dist: python-dotenv>=1.0.0
27
+ Requires-Dist: requests>=2.32.0
28
+ Requires-Dist: tabulate>=0.9.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: build>=1.2.0; extra == "dev"
31
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # Stanford EDGAR Parser
35
+
36
+ Layout-faithful SEC filing parser used by the Stanford EDGAR Filings Dataset.
37
+ It converts raw EDGAR TXT/HTML/SGML/XML submissions into Markdown or
38
+ MultiMarkdown while preserving financial-table structure, indentation, links,
39
+ inline formatting, and filing metadata where possible.
40
+
41
+ ## Install
42
+
43
+ From PyPI, after release:
44
+
45
+ ```bash
46
+ pip install stanford-edgar-parser
47
+ ```
48
+
49
+ Until then, install directly from GitHub:
50
+
51
+ ```bash
52
+ pip install "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git"
53
+ ```
54
+
55
+ For local development from a clone:
56
+
57
+ ```bash
58
+ pip install -e .
59
+ ```
60
+
61
+ ## Layout
62
+
63
+ - `runtime.py`: backward-compatible re-export shim
64
+ - `orchestrator.py`: local filing orchestration and final output cleanup
65
+ - `utils/`: imports, tokenizer helpers, parse statistics, and shared setup
66
+ - `multimarkdown/`: MultiMarkdown table conversion
67
+ - `parsers/html/`: HTML preprocessing, table cleanup, parser, and postprocessing
68
+ - `parsers/ocr/`: Mistral OCR key rotation, PDF/image OCR, and OCR utilities
69
+ - `parsers/plaintext/`: plaintext and legacy text-form parsers
70
+ - `parsers/sgml/`: SGML document-block utilities
71
+ - `parsers/xml/`: XML filing-form parsers
72
+ - `sec_parser.py`: compatibility shim for old `python stanford_edgar_parser/sec_parser.py` usage
73
+ - `__main__.py`: `python -m stanford_edgar_parser` command-line entrypoint
74
+
75
+ In the source repository, the original implementation remains untouched at `sec_parser/sec_parser.py`.
76
+ The equivalence tests in `tests/parser_equivalence/` verify the split-module
77
+ coverage and compare parser outputs bit-for-bit.
78
+
79
+ ## Usage
80
+
81
+ ```bash
82
+ python -m stanford_edgar_parser path/to/filing.txt
83
+ python -m stanford_edgar_parser path/to/filing.txt --to_mmd
84
+ stanford-edgar-parser path/to/filing.txt --to_mmd
85
+ ```
86
+
87
+ ```python
88
+ from stanford_edgar_parser import main_one, parse_html_filing
89
+ ```
90
+
91
+ ## Agent Skill Install
92
+
93
+ Install bundled Codex and Claude skill files:
94
+
95
+ ```bash
96
+ stanford-edgar-install-skill
97
+ ```
98
+
99
+ Or from Python:
100
+
101
+ ```python
102
+ from stanford_edgar_parser.ai import install_skill
103
+
104
+ install_skill()
105
+ ```
106
+
107
+ Use `--overwrite` if you want to replace an existing installed skill.
108
+
109
+ ## MCP
110
+
111
+ After package install, expose the parser as an MCP server with:
112
+
113
+ ```toml
114
+ [mcp_servers.stanford_edgar_parser]
115
+ command = "uvx"
116
+ args = ["--from", "stanford-edgar-parser", "stanford-edgar-mcp"]
117
+ startup_timeout_sec = 120
118
+ ```
119
+
120
+ Before the PyPI release, use the GitHub package source:
121
+
122
+ ```toml
123
+ [mcp_servers.stanford_edgar_parser]
124
+ command = "uvx"
125
+ args = [
126
+ "--from",
127
+ "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git",
128
+ "stanford-edgar-mcp"
129
+ ]
130
+ startup_timeout_sec = 120
131
+ ```
132
+
133
+ The package-installed MCP server always exposes `parse_filing`. Repo-local
134
+ rendering and review tools are exposed when the full clone includes
135
+ `multimarkdown.js`, `html-to-pdf.mjs`, and `tools/`.
@@ -0,0 +1,76 @@
1
+ # Stanford EDGAR Parser
2
+
3
+ Layout-faithful SEC EDGAR filing parser from the Stanford EDGAR Filings Dataset.
4
+ It converts raw EDGAR TXT/HTML/SGML/XML submissions into Markdown or
5
+ MultiMarkdown while preserving financial-table structure, indentation, links,
6
+ inline formatting, and filing metadata where possible.
7
+
8
+ ## Install
9
+
10
+ After PyPI release:
11
+
12
+ ```bash
13
+ pip install stanford-edgar-parser
14
+ ```
15
+
16
+ Until then, install directly from GitHub:
17
+
18
+ ```bash
19
+ pip install "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git"
20
+ ```
21
+
22
+ ## Parse
23
+
24
+ ```bash
25
+ stanford-edgar-parser path/to/filing.txt --to_mmd
26
+ ```
27
+
28
+ or:
29
+
30
+ ```bash
31
+ python -m stanford_edgar_parser path/to/filing.txt --to_mmd
32
+ ```
33
+
34
+ ## Agent Skills
35
+
36
+ Install bundled Codex and Claude skills:
37
+
38
+ ```bash
39
+ stanford-edgar-install-skill
40
+ ```
41
+
42
+ or:
43
+
44
+ ```python
45
+ from stanford_edgar_parser.ai import install_skill
46
+
47
+ install_skill()
48
+ ```
49
+
50
+ ## MCP
51
+
52
+ Package install:
53
+
54
+ ```toml
55
+ [mcp_servers.stanford_edgar_parser]
56
+ command = "uvx"
57
+ args = ["--from", "stanford-edgar-parser", "stanford-edgar-mcp"]
58
+ startup_timeout_sec = 120
59
+ ```
60
+
61
+ GitHub install before PyPI release:
62
+
63
+ ```toml
64
+ [mcp_servers.stanford_edgar_parser]
65
+ command = "uvx"
66
+ args = [
67
+ "--from",
68
+ "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git",
69
+ "stanford-edgar-mcp"
70
+ ]
71
+ startup_timeout_sec = 120
72
+ ```
73
+
74
+ The package-installed MCP server always exposes `parse_filing`. Repo-local
75
+ rendering and review tools are exposed when the full clone includes
76
+ `multimarkdown.js`, `html-to-pdf.mjs`, and `tools/`.
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "stanford-edgar-parser"
7
+ version = "0.1.0"
8
+ description = "Layout-faithful SEC EDGAR filing parser from the Stanford EDGAR Filings Dataset."
9
+ readme = "stanford_edgar_parser/README.md"
10
+ requires-python = ">=3.11"
11
+ authors = [
12
+ { name = "Stanford Advanced Financial Technologies Lab" }
13
+ ]
14
+ keywords = ["sec", "edgar", "filings", "multimarkdown", "financial-documents"]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Operating System :: OS Independent",
19
+ "Topic :: Text Processing :: Markup",
20
+ ]
21
+ dependencies = [
22
+ "beautifulsoup4>=4.12.3",
23
+ "imgkit>=1.2.3",
24
+ "lxml>=6.0.0",
25
+ "mistralai>=1.9.0",
26
+ "numpy>=2.0.0",
27
+ "pandas>=2.0.0",
28
+ "playwright>=1.48.0",
29
+ "pydantic>=2.0.0",
30
+ "PyMuPDF>=1.24.0",
31
+ "PyPDF2>=3.0.1",
32
+ "python-dotenv>=1.0.0",
33
+ "requests>=2.32.0",
34
+ "tabulate>=0.9.0",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ dev = [
39
+ "build>=1.2.0",
40
+ "pytest>=8.0.0",
41
+ ]
42
+
43
+ [project.urls]
44
+ Homepage = "https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset"
45
+ Repository = "https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset"
46
+
47
+ [project.scripts]
48
+ stanford-edgar-parser = "stanford_edgar_parser.__main__:main"
49
+ stanford-edgar-mcp = "stanford_edgar_parser.mcp_server:main"
50
+ stanford-edgar-install-skill = "stanford_edgar_parser.ai:main"
51
+
52
+ [tool.setuptools.packages.find]
53
+ include = ["stanford_edgar_parser*"]
54
+
55
+ [tool.setuptools.package-data]
56
+ stanford_edgar_parser = ["agent_assets/**/*", "README.md"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,102 @@
1
+ # Stanford EDGAR Parser
2
+
3
+ Layout-faithful SEC filing parser used by the Stanford EDGAR Filings Dataset.
4
+ It converts raw EDGAR TXT/HTML/SGML/XML submissions into Markdown or
5
+ MultiMarkdown while preserving financial-table structure, indentation, links,
6
+ inline formatting, and filing metadata where possible.
7
+
8
+ ## Install
9
+
10
+ From PyPI, after release:
11
+
12
+ ```bash
13
+ pip install stanford-edgar-parser
14
+ ```
15
+
16
+ Until then, install directly from GitHub:
17
+
18
+ ```bash
19
+ pip install "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git"
20
+ ```
21
+
22
+ For local development from a clone:
23
+
24
+ ```bash
25
+ pip install -e .
26
+ ```
27
+
28
+ ## Layout
29
+
30
+ - `runtime.py`: backward-compatible re-export shim
31
+ - `orchestrator.py`: local filing orchestration and final output cleanup
32
+ - `utils/`: imports, tokenizer helpers, parse statistics, and shared setup
33
+ - `multimarkdown/`: MultiMarkdown table conversion
34
+ - `parsers/html/`: HTML preprocessing, table cleanup, parser, and postprocessing
35
+ - `parsers/ocr/`: Mistral OCR key rotation, PDF/image OCR, and OCR utilities
36
+ - `parsers/plaintext/`: plaintext and legacy text-form parsers
37
+ - `parsers/sgml/`: SGML document-block utilities
38
+ - `parsers/xml/`: XML filing-form parsers
39
+ - `sec_parser.py`: compatibility shim for old `python stanford_edgar_parser/sec_parser.py` usage
40
+ - `__main__.py`: `python -m stanford_edgar_parser` command-line entrypoint
41
+
42
+ In the source repository, the original implementation remains untouched at `sec_parser/sec_parser.py`.
43
+ The equivalence tests in `tests/parser_equivalence/` verify the split-module
44
+ coverage and compare parser outputs bit-for-bit.
45
+
46
+ ## Usage
47
+
48
+ ```bash
49
+ python -m stanford_edgar_parser path/to/filing.txt
50
+ python -m stanford_edgar_parser path/to/filing.txt --to_mmd
51
+ stanford-edgar-parser path/to/filing.txt --to_mmd
52
+ ```
53
+
54
+ ```python
55
+ from stanford_edgar_parser import main_one, parse_html_filing
56
+ ```
57
+
58
+ ## Agent Skill Install
59
+
60
+ Install bundled Codex and Claude skill files:
61
+
62
+ ```bash
63
+ stanford-edgar-install-skill
64
+ ```
65
+
66
+ Or from Python:
67
+
68
+ ```python
69
+ from stanford_edgar_parser.ai import install_skill
70
+
71
+ install_skill()
72
+ ```
73
+
74
+ Use `--overwrite` if you want to replace an existing installed skill.
75
+
76
+ ## MCP
77
+
78
+ After package install, expose the parser as an MCP server with:
79
+
80
+ ```toml
81
+ [mcp_servers.stanford_edgar_parser]
82
+ command = "uvx"
83
+ args = ["--from", "stanford-edgar-parser", "stanford-edgar-mcp"]
84
+ startup_timeout_sec = 120
85
+ ```
86
+
87
+ Before the PyPI release, use the GitHub package source:
88
+
89
+ ```toml
90
+ [mcp_servers.stanford_edgar_parser]
91
+ command = "uvx"
92
+ args = [
93
+ "--from",
94
+ "stanford-edgar-parser @ git+https://github.com/Stanford-Advanced-FinTech-Lab-SAFTL/stanford-edgar-filings-dataset.git",
95
+ "stanford-edgar-mcp"
96
+ ]
97
+ startup_timeout_sec = 120
98
+ ```
99
+
100
+ The package-installed MCP server always exposes `parse_filing`. Repo-local
101
+ rendering and review tools are exposed when the full clone includes
102
+ `multimarkdown.js`, `html-to-pdf.mjs`, and `tools/`.
@@ -0,0 +1,39 @@
1
+ """Stanford EDGAR filings parser package."""
2
+
3
+ from . import _state
4
+ from . import api as _sec_parser
5
+
6
# Public API re-exports: bind selected attributes of the ``api`` module
# (imported above as ``_sec_parser``) as package-level names, so callers can
# write e.g. ``from stanford_edgar_parser import main_one, parse_html_filing``.
clean_financial_df = _sec_parser.clean_financial_df
convert_all_tables_to_mmd = _sec_parser.convert_all_tables_to_mmd
df_to_markdown = _sec_parser.df_to_markdown
df_to_multimarkdown = _sec_parser.df_to_multimarkdown
estimate_parser_tokens = _sec_parser.estimate_parser_tokens
main_one = _sec_parser.main_one
normalize_text_markup = _sec_parser.normalize_text_markup
parse_any_xml = _sec_parser.parse_any_xml
parse_html_filing = _sec_parser.parse_html_filing
parse_pdf_attachments = _sec_parser.parse_pdf_attachments
parse_plaintext_filing = _sec_parser.parse_plaintext_filing
process_local_xbrl = _sec_parser.process_local_xbrl
18
+
19
+
20
def __getattr__(name: str):
    """Resolve package attributes that are not bound in this module (PEP 562).

    Python calls this hook only when normal module attribute lookup fails.
    The shared ``_state`` module is consulted first, then the ``api`` module
    (``_sec_parser``); the first one that defines ``name`` wins.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The attribute value taken from ``_state`` or, failing that, ``api``.

    Raises:
        AttributeError: If neither module defines ``name``. The message names
            this package (the module the caller actually accessed) and is
            chained to the underlying lookup error.
    """
    last_error = None
    for provider in (_state, _sec_parser):
        try:
            return getattr(provider, name)
        except AttributeError as exc:
            # Remember the failure and fall through to the next provider.
            last_error = exc
    raise AttributeError(
        f"module {__name__!r} has no attribute {name!r}"
    ) from last_error
24
+
25
# Names exported by ``from stanford_edgar_parser import *``.
# ``LAST_PARSE_STATS`` is not assigned in this module; it is resolved on
# access by the module-level ``__getattr__`` (which checks ``_state`` first).
__all__ = [
    "LAST_PARSE_STATS",
    "clean_financial_df",
    "convert_all_tables_to_mmd",
    "df_to_markdown",
    "df_to_multimarkdown",
    "estimate_parser_tokens",
    "main_one",
    "normalize_text_markup",
    "parse_any_xml",
    "parse_html_filing",
    "parse_pdf_attachments",
    "parse_plaintext_filing",
    "process_local_xbrl",
]
@@ -0,0 +1,80 @@
1
+ """Command-line entrypoint for ``python -m stanford_edgar_parser``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import pathlib
8
+ import sys
9
+
10
+ from .api import (
11
+ get_mistral_key_status_snapshot,
12
+ main_one,
13
+ reset_mistral_key_status,
14
+ )
15
+
16
+
17
+ def main(argv: list[str] | None = None) -> int:
18
+ parser = argparse.ArgumentParser(
19
+ description="Parse an SEC filing in HTML, HTM, or TXT format and convert it to Markdown.",
20
+ formatter_class=argparse.RawTextHelpFormatter,
21
+ )
22
+ parser.add_argument("path", nargs="?", help="Path to the SEC filing.")
23
+ parser.add_argument(
24
+ "--to_mmd",
25
+ action="store_true",
26
+ help="Convert all tables in the final output to MultiMarkdown format.",
27
+ )
28
+ parser.add_argument(
29
+ "--source-document-url",
30
+ help="Optional absolute source document URL used to resolve normal relative links.",
31
+ )
32
+ parser.add_argument(
33
+ "--disable_indentation",
34
+ "--disable-indentation",
35
+ action="store_true",
36
+ dest="disable_indentation",
37
+ help="Remove final-output indentation NBSP markers from the written Markdown.",
38
+ )
39
+ parser.add_argument(
40
+ "--mistral-key-status",
41
+ action="store_true",
42
+ help="Print the shared Mistral key rotation/usage monitor JSON and exit.",
43
+ )
44
+ parser.add_argument(
45
+ "--reset-mistral-key-status",
46
+ action="store_true",
47
+ help="Reset the shared Mistral key rotation/usage monitor JSON and exit.",
48
+ )
49
+ args = parser.parse_args(argv)
50
+
51
+ if args.reset_mistral_key_status:
52
+ print(json.dumps(reset_mistral_key_status(), indent=2, sort_keys=True))
53
+ return 0
54
+ if args.mistral_key_status:
55
+ print(json.dumps(get_mistral_key_status_snapshot(), indent=2, sort_keys=True))
56
+ return 0
57
+ if not args.path:
58
+ parser.error("the following arguments are required: path")
59
+
60
+ file_path = pathlib.Path(args.path)
61
+ if not file_path.is_file() and file_path.parts and file_path.parts[0] == "sec_parser":
62
+ alt_path = pathlib.Path(*file_path.parts[1:])
63
+ if alt_path.is_file():
64
+ print(f"[info] Using '{alt_path}' instead of '{args.path}'.")
65
+ file_path = alt_path
66
+ if not file_path.is_file():
67
+ print(f"Error: File not found at {args.path}", file=sys.stderr)
68
+ return 1
69
+
70
+ main_one(
71
+ file_path,
72
+ to_mmd=args.to_mmd,
73
+ source_document_url=args.source_document_url,
74
+ disable_indentation=args.disable_indentation,
75
+ )
76
+ return 0
77
+
78
+
79
+ if __name__ == "__main__":
80
+ raise SystemExit(main())
@@ -0,0 +1,20 @@
1
# Ordered ``(relative module path, first, last)`` triples. The numeric ranges
# are contiguous: each entry starts one past the previous entry's end, and
# together they span 1 through 15460.
# NOTE(review): presumably 1-based inclusive line ranges of the original
# single-file parser from which each split module was cut — confirm.
FRAGMENTS: list[tuple[str, int, int]] = [
    ("utils/bootstrap.py", 1, 113),
    ("utils/tokenizer.py", 114, 232),
    ("utils/parse_stats.py", 233, 497),
    ("parsers/ocr/mistral_keys.py", 498, 1033),
    ("parsers/ocr/ocr_utils.py", 1034, 1815),
    ("multimarkdown/multimarkdown.py", 1816, 2128),
    ("parsers/html/table_cleaning.py", 2129, 4211),
    ("parsers/xml/ownership.py", 4212, 4509),
    ("parsers/plaintext/plaintext_parser.py", 4510, 4592),
    ("parsers/xml/fund_and_ownership.py", 4593, 7342),
    ("parsers/xml/regulatory_forms.py", 7343, 11388),
    ("parsers/html/preprocessing.py", 11389, 13085),
    ("parsers/html/html.py", 13086, 13850),
    ("parsers/html/postprocessing.py", 13851, 14531),
    ("parsers/plaintext/legacy_form_parsers.py", 14532, 14675),
    ("parsers/ocr/ocr.py", 14676, 14794),
    ("parsers/sgml/sgml_utils.py", 14795, 15053),
    ("orchestrator.py", 15054, 15460),
]
@@ -0,0 +1,7 @@
1
+ """Mutable parser state shared by import-native parser modules."""
2
+
3
# Module-level mutable globals. The code that reads and writes them lives in
# other modules.
# NOTE(review): the per-variable notes below are inferred from the names and
# initial values only — confirm against the consuming modules.

# Presumably identifies the filing currently being parsed; "Unknown" until set.
CURRENT_PROCESSING_FILE = "Unknown"
# Starts as an empty set; presumably tracks filings already logged for OCR.
CURRENT_OCR_LOGGED_FILINGS = set()
# None until set; presumably the URL supplied via --source-document-url.
CURRENT_SOURCE_DOCUMENT_URL = None
# None until set. Listed in the package ``__all__`` and resolved through the
# package-level ``__getattr__``, so callers read the current value on access.
LAST_PARSE_STATS = None
# Starts at 0; presumably a page counter for positioned-HTML OCR.
LAST_POSITIONED_HTML_OCR_PAGE_COUNT = 0
@@ -0,0 +1,38 @@
1
+ ---
2
+ name: stanford-edgar-parser
3
+ description: Use the Stanford EDGAR Filings Dataset parser to parse SEC filings into layout-faithful MultiMarkdown, render and inspect examples, debug table/indentation/link/OCR issues, run showcase checks, and use the local MCP server.
4
+ ---
5
+
6
+ # Stanford EDGAR Parser
7
+
8
+ Use this skill in the Stanford EDGAR parser repository or in an environment
9
+ where the `stanford-edgar-parser` Python package is installed.
10
+
11
+ ## Commands
12
+
13
+ - Parse to MultiMarkdown:
14
+ `python -m stanford_edgar_parser path/to/filing.txt --to_mmd`
15
+ or `stanford-edgar-parser path/to/filing.txt --to_mmd`
16
+ - Remove final indentation markers:
17
+ `python -m stanford_edgar_parser path/to/filing.txt --to_mmd --disable-indentation`
18
+ - Render:
19
+ `node multimarkdown.js path/to/parsed.md > /tmp/sefd.html`
20
+ `node html-to-pdf.mjs /tmp/sefd.html path/to/rendered.pdf`
21
+ - Static showcase checks:
22
+ `python tools/check_showcase_tables.py examples`
23
+ - Raw-vs-parsed review:
24
+ `python tools/review_snippet.py <example-dir-or-accession> "<needle text>"`
25
+ - MCP server:
26
+ `python -m stanford_edgar_parser.mcp_server`
27
+ or `stanford-edgar-mcp`
28
+
29
+ The repo includes `.mcp.json` for clients that support project-local MCP configuration.
30
+ Package installs always expose `parse_filing`; repo-only render/review tools are exposed only when helper scripts are present.
31
+
32
+ ## Review Standard
33
+
34
+ Compare parser output to the raw browser view. Do not approve output just because rendered Markdown looks plausible.
35
+
36
+ Prioritize table fidelity: visible columns/rows, merged headers, indentation hierarchy, `$`, `%`, `)`, `bp`, accounting parentheses, superscripts, subscripts, same-target links, and image placeholders.
37
+
38
+ Fix parser root causes. Do not introduce phrase-specific showcase hardcodes.
@@ -0,0 +1,50 @@
1
+ ---
2
+ name: stanford-edgar-parser
3
+ description: Parse, render, inspect, and debug SEC EDGAR filings with the Stanford EDGAR Filings Dataset parser. Use when Codex is asked to convert local SEC filing TXT/HTML/SGML/XML/PDF-containing submissions into layout-faithful MultiMarkdown, review parser output against raw browser layout, diagnose table/indentation/link/OCR artifacts, run showcase checks, or use the repo's MCP server.
4
+ ---
5
+
6
+ # Stanford EDGAR Parser
7
+
8
+ Use this skill inside a Stanford EDGAR parser repo clone or in an environment
9
+ where the `stanford-edgar-parser` Python package is installed.
10
+
11
+ ## Quick Workflow
12
+
13
+ 1. Parse local filings with:
14
+ `python -m stanford_edgar_parser path/to/filing.txt --to_mmd`
15
+ or, from an installed package:
16
+ `stanford-edgar-parser path/to/filing.txt --to_mmd`
17
+ 2. Use `--disable-indentation` only when the caller wants final `&nbsp;` indentation markers removed.
18
+ 3. Render Markdown when visual QA matters:
19
+ `node multimarkdown.js path/to/parsed.md > /tmp/sefd.html`
20
+ `node html-to-pdf.mjs /tmp/sefd.html path/to/rendered.pdf`
21
+ 4. Run showcase checks before accepting examples or parser changes:
22
+ `python tools/check_showcase_tables.py examples`
23
+ 5. For suspicious output, compare raw HTML and parsed Markdown with:
24
+ `python tools/review_snippet.py <example-dir-or-accession> "<needle text>"`
25
+
26
+ ## Debugging Rules
27
+
28
+ - Fix source-level reconstruction logic, not local phrase hardcodes.
29
+ - Preserve filer text if the raw source itself has a typo, missing space, or odd punctuation.
30
+ - Never assume rendered Markdown alone is correct; compare against the raw browser view.
31
+ - Watch especially for detached `$`, `%`, `)`, `bp`, lost negative parentheses, dropped columns, row/col span drift, malformed emphasis, broken same-URL links, and missing indentation in lists/tables.
32
+ - Keep scratch review artifacts out of published `examples/` unless explicitly requested.
33
+
34
+ ## MCP
35
+
36
+ Start the local MCP server with:
37
+ `python -m stanford_edgar_parser.mcp_server`
38
+
39
+ or, from an installed package:
40
+ `stanford-edgar-mcp`
41
+
42
+ It exposes parser-oriented tools for parsing filings, rendering Markdown to PDF, running showcase checks, and generating raw-vs-parsed review snippets.
43
+
44
+ Package installs always expose `parse_filing`. Repo-only render/review tools are exposed only when `multimarkdown.js`, `html-to-pdf.mjs`, and `tools/` are present.
45
+
46
+ The repo includes `.mcp.json` for clients that support project-local MCP configuration.
47
+
48
+ ## Reference
49
+
50
+ Read `references/review.md` when doing a meticulous parser-output review or preparing showcase examples.
@@ -0,0 +1,4 @@
1
+ interface:
2
+ display_name: "Stanford EDGAR Parser"
3
+ short_description: "Parse SEC filings into SEFD MultiMarkdown"
4
+ default_prompt: "Use $stanford-edgar-parser to parse an SEC filing into layout-faithful MultiMarkdown and review the output."
@@ -0,0 +1,34 @@
1
+ # Parser Output Review
2
+
3
+ Use this checklist when reviewing parsed filings or showcase examples.
4
+
5
+ ## Compare Against Source
6
+
7
+ - Inspect the raw browser view for the same section, table, or paragraph.
8
+ - If the raw source itself has a typo or missing space, preserve it unless the task explicitly calls for normalization.
9
+ - If the browser visually joins split HTML fragments, the parser should usually reconstruct the same semantic value.
10
+
11
+ ## Tables
12
+
13
+ - Check that every visible column and row appears.
14
+ - Check that merged headers remain grouped with MultiMarkdown `||` and `^^`.
15
+ - Check that currency/percent/parentheses modifiers attach to numbers: `$7700`, `75.0%`, `(200)`, `)bp`.
16
+ - Check that empty body rows are not artifacts from rowspan/colspan scaffolding.
17
+ - Check that numeric signs and accounting parentheses are not flipped or dropped.
18
+ - Check that row labels keep indentation levels where they carry hierarchy.
19
+
20
+ ## Text And Lists
21
+
22
+ - Check numbered, alphabetic, and parenthesized list markers for a visible space or indentation after the marker.
23
+ - Check that paragraph indentation is preserved where it expresses hierarchy.
24
+ - Check that adjacent styled spans do not create malformed Markdown emphasis.
25
+ - Check that adjacent links with the same target preserve readable spacing.
26
+ - Check image placeholders are explicit and not confused with body text.
27
+
28
+ ## Useful Commands
29
+
30
+ - `python tools/check_showcase_tables.py examples`
31
+ - `python tools/review_snippet.py examples/<accession> "<needle text>"`
32
+ - `python -m stanford_edgar_parser examples/<accession>/raw.txt --to_mmd`
33
+ - `node multimarkdown.js examples/<accession>/parsed.md > /tmp/sefd.html`
34
+ - `node html-to-pdf.mjs /tmp/sefd.html /tmp/sefd.pdf`