ipo-mine 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipo_mine-0.0.0/PKG-INFO +150 -0
- ipo_mine-0.0.0/README.md +132 -0
- ipo_mine-0.0.0/pyproject.toml +36 -0
- ipo_mine-0.0.0/setup.cfg +4 -0
- ipo_mine-0.0.0/src/ipo_mine/__init__.py +3 -0
- ipo_mine-0.0.0/src/ipo_mine/download/__init__.py +7 -0
- ipo_mine-0.0.0/src/ipo_mine/download/company.py +112 -0
- ipo_mine-0.0.0/src/ipo_mine/download/dataset.py +193 -0
- ipo_mine-0.0.0/src/ipo_mine/download/downloader.py +347 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/CompanyFilings.py +27 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/Filing.py +17 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/FilingImage.py +9 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/FormType.py +8 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/S1Filing.py +52 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/S1FilingImage.py +11 -0
- ipo_mine-0.0.0/src/ipo_mine/entities/__init__.py +12 -0
- ipo_mine-0.0.0/src/ipo_mine/mappings/__init__.py +1 -0
- ipo_mine-0.0.0/src/ipo_mine/mappings/global_mapping.py +44 -0
- ipo_mine-0.0.0/src/ipo_mine/parse/__init__.py +12 -0
- ipo_mine-0.0.0/src/ipo_mine/parse/ipo_parser.py +276 -0
- ipo_mine-0.0.0/src/ipo_mine/parse/process_text_image.py +309 -0
- ipo_mine-0.0.0/src/ipo_mine/parse/section_parser.py +355 -0
- ipo_mine-0.0.0/src/ipo_mine/parse/table_parser.py +329 -0
- ipo_mine-0.0.0/src/ipo_mine/parse/toc_parser.py +1249 -0
- ipo_mine-0.0.0/src/ipo_mine/resources/__init__.py +31 -0
- ipo_mine-0.0.0/src/ipo_mine/utils/__init__.py +41 -0
- ipo_mine-0.0.0/src/ipo_mine/utils/config.py +191 -0
- ipo_mine-0.0.0/src/ipo_mine/utils/helpers.py +126 -0
- ipo_mine-0.0.0/src/ipo_mine.egg-info/PKG-INFO +150 -0
- ipo_mine-0.0.0/src/ipo_mine.egg-info/SOURCES.txt +32 -0
- ipo_mine-0.0.0/src/ipo_mine.egg-info/dependency_links.txt +1 -0
- ipo_mine-0.0.0/src/ipo_mine.egg-info/requires.txt +11 -0
- ipo_mine-0.0.0/src/ipo_mine.egg-info/top_level.txt +1 -0
- ipo_mine-0.0.0/tests/test.py +0 -0
ipo_mine-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ipo-mine
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Mining and parsing S-1 IPO filings
|
|
5
|
+
Author: Michael Galarnyk
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: pandas>=1.5
|
|
9
|
+
Requires-Dist: numpy>=1.21
|
|
10
|
+
Requires-Dist: beautifulsoup4>=4.11
|
|
11
|
+
Requires-Dist: lxml>=4.9
|
|
12
|
+
Requires-Dist: requests>=2.28
|
|
13
|
+
Requires-Dist: fuzzywuzzy>=0.18.0
|
|
14
|
+
Requires-Dist: python-levenshtein
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest; extra == "dev"
|
|
17
|
+
Requires-Dist: jupyter; extra == "dev"
|
|
18
|
+
|
|
19
|
+
# IPO-Mine: S-1 (IPO) Filings Toolkit
|
|
20
|
+
|
|
21
|
+
GitHub Repository: https://github.com/gtfintechlab/S1-Filings
|
|
22
|
+
Project Website: https://ipo-mine.web.app/
|
|
23
|
+
|
|
24
|
+
## Overview
|
|
25
|
+
|
|
26
|
+
IPO-Mine is a Python package for downloading, parsing, and structuring S-1 IPO filings from the U.S. Securities and Exchange Commission (SEC) EDGAR system.
|
|
27
|
+
|
|
28
|
+
This repository implements the data processing pipeline used to construct the IPO-Mine dataset, a section-structured corpus introduced in the research paper:
|
|
29
|
+
|
|
30
|
+
IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings
|
|
31
|
+
|
|
32
|
+
The objective of this project is to transform raw SEC filings into clean, standardized, and section-aligned textual representations suitable for large-scale analysis in natural language processing, information retrieval, and long-document modeling.
|
|
33
|
+
|
|
34
|
+
## Motivation
|
|
35
|
+
|
|
36
|
+
S-1 filings are among the most complex regulatory documents used in empirical research. They exhibit several challenges:
|
|
37
|
+
|
|
38
|
+
- Extreme document length, often exceeding 100–300 pages
|
|
39
|
+
- Substantial variation in section headers across firms and time
|
|
40
|
+
- Heterogeneous formats, including HTML, plain text, and scanned images
|
|
41
|
+
- Limited structural consistency despite regulatory guidance
|
|
42
|
+
|
|
43
|
+
These characteristics complicate tasks such as section segmentation, cross-firm comparison, longitudinal analysis, and long-context modeling.
|
|
44
|
+
|
|
45
|
+
IPO-Mine addresses these challenges by providing a unified and reproducible pipeline that converts raw EDGAR filings into structured, research-ready data.
|
|
46
|
+
|
|
47
|
+
## Features
|
|
48
|
+
|
|
49
|
+
- Automated downloading of S-1 and S-1/A filings from SEC EDGAR
|
|
50
|
+
- Parsing of Tables of Contents (TOCs) for filings dating back to 1997
|
|
51
|
+
- Extraction and normalization of key IPO sections, including:
|
|
52
|
+
- Risk Factors
|
|
53
|
+
- Business
|
|
54
|
+
- Use of Proceeds
|
|
55
|
+
- Management’s Discussion and Analysis (MD&A)
|
|
56
|
+
- Financial Statements
|
|
57
|
+
- Support for multiple filing formats:
|
|
58
|
+
- HTML
|
|
59
|
+
- plain text
|
|
60
|
+
- image-based filings via OCR
|
|
61
|
+
- Fuzzy matching of section headers using global section mappings
|
|
62
|
+
- Deterministic outputs suitable for reproducible dataset construction
|
|
63
|
+
|
|
64
|
+
## IPO-Mine Dataset
|
|
65
|
+
|
|
66
|
+
Using this toolkit, the IPO-Mine dataset is constructed as a large-scale corpus of IPO filings with:
|
|
67
|
+
|
|
68
|
+
- Section-aligned text across firms
|
|
69
|
+
- Standardized section nomenclature
|
|
70
|
+
- Clean document boundaries
|
|
71
|
+
- Compatibility with long-document modeling and retrieval frameworks
|
|
72
|
+
|
|
73
|
+
Additional details and examples are available at:
|
|
74
|
+
|
|
75
|
+
https://ipo-mine.web.app/
|
|
76
|
+
|
|
77
|
+
## Installation
|
|
78
|
+
|
|
79
|
+
The package is available on PyPI under the name `ipo-mine`.
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
pip install ipo-mine
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## OCR Dependency
|
|
86
|
+
|
|
87
|
+
Parsing image-based filings requires a local installation of Tesseract OCR.
|
|
88
|
+
|
|
89
|
+
### Tesseract Installation
|
|
90
|
+
|
|
91
|
+
| Operating System | Installation Method |
|
|
92
|
+
|------------------|---------------------|
|
|
93
|
+
| macOS | `brew install tesseract` |
|
|
94
|
+
| Ubuntu / Debian | `sudo apt install tesseract-ocr` |
|
|
95
|
+
| Windows | UB Mannheim Tesseract installer |
|
|
96
|
+
| Conda environments | `conda install -c conda-forge tesseract` |
|
|
97
|
+
|
|
98
|
+
## Example Usage
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from ipo_mine.download.company import Company
|
|
102
|
+
from ipo_mine.download import S1Downloader
|
|
103
|
+
from ipo_mine.parse.ipo_parser import S1Parser
|
|
104
|
+
from ipo_mine.resources import GLOBAL_SECTIONS_JSON
|
|
105
|
+
from ipo_mine.utils.config import PARSED_DIR
|
|
106
|
+
|
|
107
|
+
downloader = S1Downloader(
|
|
108
|
+
email="your_email@domain.com",
|
|
109
|
+
company="Your Institution"
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
ticker = "SNOW"
|
|
113
|
+
filing = downloader.download_s1(Company.from_ticker(ticker))
|
|
114
|
+
|
|
115
|
+
parser = S1Parser(
|
|
116
|
+
filing=filing,
|
|
117
|
+
mappings_path=GLOBAL_SECTIONS_JSON,
|
|
118
|
+
output_base_path=PARSED_DIR
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
risk_factors = parser.parse_section("Risk Factors", ticker)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Research-Oriented Design
|
|
125
|
+
|
|
126
|
+
This library is designed primarily for dataset construction and reproducible empirical research rather than ad-hoc scraping.
|
|
127
|
+
|
|
128
|
+
Typical use cases include:
|
|
129
|
+
|
|
130
|
+
- Building section-aligned IPO corpora
|
|
131
|
+
- Comparing disclosure language across firms and time
|
|
132
|
+
- Training and evaluation of long-document language models
|
|
133
|
+
- Large-scale studies of regulatory disclosures
|
|
134
|
+
|
|
135
|
+
## Citation
|
|
136
|
+
|
|
137
|
+
If you use this package or the IPO-Mine dataset in your research, please cite:
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
@inproceedings{ipomine2025,
|
|
141
|
+
title = {IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings},
|
|
142
|
+
author = {Author names},
|
|
143
|
+
booktitle = {Proceedings of the ACM SIGKDD Conference},
|
|
144
|
+
year = {2025}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
This project is released under the MIT License.
|
ipo_mine-0.0.0/README.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# IPO-Mine: S-1 (IPO) Filings Toolkit
|
|
2
|
+
|
|
3
|
+
GitHub Repository: https://github.com/gtfintechlab/S1-Filings
|
|
4
|
+
Project Website: https://ipo-mine.web.app/
|
|
5
|
+
|
|
6
|
+
## Overview
|
|
7
|
+
|
|
8
|
+
IPO-Mine is a Python package for downloading, parsing, and structuring S-1 IPO filings from the U.S. Securities and Exchange Commission (SEC) EDGAR system.
|
|
9
|
+
|
|
10
|
+
This repository implements the data processing pipeline used to construct the IPO-Mine dataset, a section-structured corpus introduced in the research paper:
|
|
11
|
+
|
|
12
|
+
IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings
|
|
13
|
+
|
|
14
|
+
The objective of this project is to transform raw SEC filings into clean, standardized, and section-aligned textual representations suitable for large-scale analysis in natural language processing, information retrieval, and long-document modeling.
|
|
15
|
+
|
|
16
|
+
## Motivation
|
|
17
|
+
|
|
18
|
+
S-1 filings are among the most complex regulatory documents used in empirical research. They exhibit several challenges:
|
|
19
|
+
|
|
20
|
+
- Extreme document length, often exceeding 100–300 pages
|
|
21
|
+
- Substantial variation in section headers across firms and time
|
|
22
|
+
- Heterogeneous formats, including HTML, plain text, and scanned images
|
|
23
|
+
- Limited structural consistency despite regulatory guidance
|
|
24
|
+
|
|
25
|
+
These characteristics complicate tasks such as section segmentation, cross-firm comparison, longitudinal analysis, and long-context modeling.
|
|
26
|
+
|
|
27
|
+
IPO-Mine addresses these challenges by providing a unified and reproducible pipeline that converts raw EDGAR filings into structured, research-ready data.
|
|
28
|
+
|
|
29
|
+
## Features
|
|
30
|
+
|
|
31
|
+
- Automated downloading of S-1 and S-1/A filings from SEC EDGAR
|
|
32
|
+
- Parsing of Tables of Contents (TOCs) for filings dating back to 1997
|
|
33
|
+
- Extraction and normalization of key IPO sections, including:
|
|
34
|
+
- Risk Factors
|
|
35
|
+
- Business
|
|
36
|
+
- Use of Proceeds
|
|
37
|
+
- Management’s Discussion and Analysis (MD&A)
|
|
38
|
+
- Financial Statements
|
|
39
|
+
- Support for multiple filing formats:
|
|
40
|
+
- HTML
|
|
41
|
+
- plain text
|
|
42
|
+
- image-based filings via OCR
|
|
43
|
+
- Fuzzy matching of section headers using global section mappings
|
|
44
|
+
- Deterministic outputs suitable for reproducible dataset construction
|
|
45
|
+
|
|
46
|
+
## IPO-Mine Dataset
|
|
47
|
+
|
|
48
|
+
Using this toolkit, the IPO-Mine dataset is constructed as a large-scale corpus of IPO filings with:
|
|
49
|
+
|
|
50
|
+
- Section-aligned text across firms
|
|
51
|
+
- Standardized section nomenclature
|
|
52
|
+
- Clean document boundaries
|
|
53
|
+
- Compatibility with long-document modeling and retrieval frameworks
|
|
54
|
+
|
|
55
|
+
Additional details and examples are available at:
|
|
56
|
+
|
|
57
|
+
https://ipo-mine.web.app/
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
The package is available on PyPI under the name `ipo-mine`.
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
pip install ipo-mine
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## OCR Dependency
|
|
68
|
+
|
|
69
|
+
Parsing image-based filings requires a local installation of Tesseract OCR.
|
|
70
|
+
|
|
71
|
+
### Tesseract Installation
|
|
72
|
+
|
|
73
|
+
| Operating System | Installation Method |
|
|
74
|
+
|------------------|---------------------|
|
|
75
|
+
| macOS | `brew install tesseract` |
|
|
76
|
+
| Ubuntu / Debian | `sudo apt install tesseract-ocr` |
|
|
77
|
+
| Windows | UB Mannheim Tesseract installer |
|
|
78
|
+
| Conda environments | `conda install -c conda-forge tesseract` |
|
|
79
|
+
|
|
80
|
+
## Example Usage
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from ipo_mine.download.company import Company
|
|
84
|
+
from ipo_mine.download import S1Downloader
|
|
85
|
+
from ipo_mine.parse.ipo_parser import S1Parser
|
|
86
|
+
from ipo_mine.resources import GLOBAL_SECTIONS_JSON
|
|
87
|
+
from ipo_mine.utils.config import PARSED_DIR
|
|
88
|
+
|
|
89
|
+
downloader = S1Downloader(
|
|
90
|
+
email="your_email@domain.com",
|
|
91
|
+
company="Your Institution"
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
ticker = "SNOW"
|
|
95
|
+
filing = downloader.download_s1(Company.from_ticker(ticker))
|
|
96
|
+
|
|
97
|
+
parser = S1Parser(
|
|
98
|
+
filing=filing,
|
|
99
|
+
mappings_path=GLOBAL_SECTIONS_JSON,
|
|
100
|
+
output_base_path=PARSED_DIR
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
risk_factors = parser.parse_section("Risk Factors", ticker)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Research-Oriented Design
|
|
107
|
+
|
|
108
|
+
This library is designed primarily for dataset construction and reproducible empirical research rather than ad-hoc scraping.
|
|
109
|
+
|
|
110
|
+
Typical use cases include:
|
|
111
|
+
|
|
112
|
+
- Building section-aligned IPO corpora
|
|
113
|
+
- Comparing disclosure language across firms and time
|
|
114
|
+
- Training and evaluation of long-document language models
|
|
115
|
+
- Large-scale studies of regulatory disclosures
|
|
116
|
+
|
|
117
|
+
## Citation
|
|
118
|
+
|
|
119
|
+
If you use this package or the IPO-Mine dataset in your research, please cite:
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
@inproceedings{ipomine2025,
|
|
123
|
+
title = {IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings},
|
|
124
|
+
author = {Author names},
|
|
125
|
+
booktitle = {Proceedings of the ACM SIGKDD Conference},
|
|
126
|
+
year = {2025}
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
This project is released under the MIT License.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ipo-mine"
|
|
7
|
+
version = "0.0.0"
|
|
8
|
+
description = "Mining and parsing S-1 IPO filings"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Michael Galarnyk" }
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
dependencies = [
|
|
17
|
+
"pandas>=1.5",
|
|
18
|
+
"numpy>=1.21",
|
|
19
|
+
"beautifulsoup4>=4.11",
|
|
20
|
+
"lxml>=4.9",
|
|
21
|
+
"requests>=2.28",
|
|
22
|
+
"fuzzywuzzy>=0.18.0",
|
|
23
|
+
"python-levenshtein",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
dev = [
|
|
28
|
+
"pytest",
|
|
29
|
+
"jupyter",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[tool.setuptools]
|
|
33
|
+
package-dir = {"" = "src"}
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["src"]
|
ipo_mine-0.0.0/setup.cfg
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from dataclasses import dataclass
from typing import ClassVar, Dict, Optional, Tuple

import pandas as pd
import requests

from resources import get_resource_path
|
|
7
|
+
|
|
8
|
+
@dataclass
class Company:
    """A public company identified by its ticker symbol and SEC CIK number.

    Attributes:
        ticker: Stock ticker symbol (upper-case).
        cik: 10-digit zero-padded SEC Central Index Key.
        active: True when the entry comes from the current SEC mapping file,
            False when it comes from the supplementary ``upgraded_mapping.csv``
            or when no mapping entry was found for the CIK.
    """

    ticker: str
    cik: str
    active: bool

    # Class-level caches shared by all instances. ClassVar keeps them out of
    # the generated __init__/__repr__/__eq__, so they are not dataclass fields.
    _ticker_to_cik: ClassVar[Optional[Dict[str, Tuple[str, bool]]]] = None
    _cik_to_ticker: ClassVar[Optional[Dict[str, Tuple[str, bool]]]] = None

    @classmethod
    def _load_mapping(cls) -> None:
        """Load and cache the ticker-CIK mapping from file(s) into dictionaries.

        Reads the bundled SEC mapping CSV (downloading it first if missing),
        best-effort merges the local ``upgraded_mapping.csv`` of inactive
        companies, and builds both lookup dictionaries. Designed to run only
        once; subsequent calls return immediately.
        """
        if cls._ticker_to_cik is not None and cls._cik_to_ticker is not None:
            return

        try:
            df = cls._read_sec_mapping()
        except FileNotFoundError:
            # First run: fetch the mapping from the SEC, then read it back.
            cls._download_ticker_cik_mapping()
            df = cls._read_sec_mapping()

        try:
            upgraded_df = pd.read_csv("upgraded_mapping.csv", dtype=str)
            upgraded_df['active'] = False  # Mark entries from the custom file as inactive
            # Combine the two DataFrames. If there are duplicate tickers,
            # the one from the SEC (marked active=True) will be kept.
            df = pd.concat([upgraded_df, df]).drop_duplicates(subset=['ticker'], keep='last')
        except Exception:
            # Best-effort merge: the upgraded mapping is optional, so any
            # failure (missing file, bad format) falls back to SEC data only.
            pass

        # Create fast dictionary lookups storing a tuple of (value, active_status)
        cls._ticker_to_cik = {
            str(row.ticker): (str(row.cik_str), bool(row.active))
            for row in df.itertuples(index=False)
        }
        cls._cik_to_ticker = {
            str(row.cik_str): (str(row.ticker), bool(row.active))
            for row in df.itertuples(index=False)
        }

    @staticmethod
    def _read_sec_mapping() -> 'pd.DataFrame':
        """Read the bundled SEC mapping CSV, marking every row as active."""
        csv_path = get_resource_path("company_tickers_to_cik.csv")
        df = pd.read_csv(csv_path, dtype=str)
        df['active'] = True  # Mark entries from the SEC file as active
        return df

    @classmethod
    def from_ticker(cls, ticker: str) -> 'Company':
        """Create Company from ticker, looking up the CIK and active status.

        Raises:
            ValueError: If the ticker is not present in the mapping.
        """
        lookup_result = cls._lookup_cik_from_ticker(ticker.upper())
        if not lookup_result:
            raise ValueError(f"No CIK found for ticker: {ticker}")

        cik, active = lookup_result
        return cls(ticker=ticker.upper(), cik=cik, active=active)

    @classmethod
    def from_cik(cls, cik: str) -> 'Company':
        """Create Company from CIK, looking up the ticker and active status.

        Unknown CIKs yield a Company with an empty ticker marked inactive
        rather than raising, preserving the original best-effort behavior.
        """
        formatted_cik = cik.zfill(10)
        lookup_result = cls._lookup_ticker_from_cik(formatted_cik)
        if not lookup_result:
            return cls(ticker="", cik=formatted_cik, active=False)

        ticker, active = lookup_result
        return cls(ticker=ticker, cik=formatted_cik, active=active)

    @classmethod
    def _lookup_cik_from_ticker(cls, ticker: str) -> Optional[Tuple[str, bool]]:
        """Look up (cik, active) for a ticker from the cached dictionary."""
        cls._load_mapping()  # Ensures mapping is loaded before lookup
        return cls._ticker_to_cik.get(ticker)

    @classmethod
    def _lookup_ticker_from_cik(cls, cik: str) -> Optional[Tuple[str, bool]]:
        """Look up (ticker, active) for a CIK from the cached dictionary."""
        cls._load_mapping()  # Ensures mapping is loaded before lookup
        return cls._cik_to_ticker.get(cik)

    @staticmethod
    def _download_ticker_cik_mapping() -> None:
        """Download the ticker-CIK mapping from SEC and save it as a CSV.

        Raises:
            RuntimeError: If the download or the save step fails.
        """
        headers = {"User-Agent": "Company Lookup Tool contact@example.com"}
        try:
            response = requests.get(
                "https://www.sec.gov/files/company_tickers.json",
                headers=headers,
                timeout=30,  # avoid hanging indefinitely on a stalled connection
            )
            response.raise_for_status()
            df = pd.DataFrame.from_dict(response.json(), orient="index")
            df["cik_str"] = df["cik_str"].astype(str).str.zfill(10)

            save_path = get_resource_path("company_tickers_to_cik.csv")
            df.to_csv(save_path, index=False)
            print(f"✅ Successfully downloaded ticker-CIK mapping → {save_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to download ticker-CIK mapping: {e}") from e
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Dict, Any, List, Union, Optional, Tuple
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from entities import S1Filing, S1FilingImage, CompanyFilings, Filing, FilingImage, FormType
|
|
5
|
+
import random
|
|
6
|
+
|
|
7
|
+
class Dataset:
    """
    Manages the collection of CompanyFilings objects, handling JSON loading and sampling.
    """

    def __init__(self, raw_data_or_path: Union[str, Dict[str, Dict[str, Any]]]):
        """
        Initializes the dataset by either loading from a file path or processing
        an existing dictionary of raw data.

        :param raw_data_or_path: Path to a JSON file on disk, or an
            already-loaded mapping of CIK -> raw company data.
        """
        if isinstance(raw_data_or_path, str):
            raw_data = self._load_json(raw_data_or_path)
        else:
            raw_data = raw_data_or_path

        # CIK -> parsed CompanyFilings
        self.companies: Dict[str, "CompanyFilings"] = self._parse_data(raw_data)

    def _load_json(self, file_path: str) -> Dict[str, Dict[str, Any]]:
        """Handles opening and parsing the JSON file."""
        with open(file_path, 'r') as f:
            return json.load(f)

    def _parse_data(self, raw_data: Dict[str, Dict[str, Any]]) -> Dict[str, "CompanyFilings"]:
        """Iterates through the raw data dictionary and creates CompanyFilings objects.

        Filings whose form type is not a known FormType are skipped with a warning.
        """
        parsed_companies = {}

        for cik, company_data in raw_data.items():
            filings_list = []

            for filing_data in company_data.get("filings", []):
                # Parse images for this filing
                images_list = [
                    FilingImage(
                        img_name=img.get('img_name'),
                        url=img.get('url'),
                        local_path=img.get('local_path')
                    ) for img in filing_data.get("images", [])
                ]

                # Parse form_type enum; skip filings we do not recognize.
                form_type_str = filing_data.get('form_type', '')
                try:
                    form_type = FormType(form_type_str)
                except ValueError:
                    print(f"Warning: Unknown form type '{form_type_str}' for CIK {cik}")
                    continue

                # NOTE: 'acession_number' (sic) matches the key used in the
                # serialized data and the Filing entity; renaming it would
                # break compatibility with existing files.
                filing = Filing(
                    form_type=form_type,
                    acession_number=filing_data.get('acession_number', ''),
                    filing_date=filing_data.get('filing_date', ''),
                    primary_document=filing_data.get('primary_document', ''),
                    filing_url=filing_data.get('filing_url', ''),
                    local_path=filing_data.get('local_path'),
                    images=images_list,
                    raw_content=filing_data.get('raw_content')
                )
                filings_list.append(filing)

            parsed_companies[cik] = CompanyFilings(
                tickers=company_data.get('tickers', []),
                cik=company_data.get('cik', cik),
                name=company_data.get('name', ''),
                sic=company_data.get('sic'),
                industry=company_data.get('industry'),
                office=company_data.get('office'),
                exchanges=company_data.get('exchanges'),
                filings=filings_list
            )

        return parsed_companies

    def sample_filings_by_year(self, num_samples: int, start_year: Optional[int] = None,
                               end_year: Optional[int] = None,
                               seed: Optional[int] = None) -> 'Dataset':
        """
        Samples up to 'num_samples' filings per year and returns a new
        Dataset instance containing only the sampled filings.

        :param num_samples: The maximum number of filings to sample per year.
        :param start_year: The start year to sample (inclusive)
        :param end_year: The end year to sample (exclusive)
        :param seed: Optional seed for reproducible sampling. None preserves
            the previous behavior of using the global random generator.
        :return: A new Dataset instance with the sampled data.
        """
        # A dedicated generator when a seed is given makes the sample
        # reproducible; otherwise keep the original module-level generator.
        rng = random if seed is None else random.Random(seed)

        # Collect all filings across all companies, grouped by year
        filings_by_year: Dict[int, List[Tuple[str, "Filing"]]] = defaultdict(list)

        for cik, company in self.companies.items():
            for filing in company.filings:
                # Extract year from filing_date (format: "YYYY-MM-DD")
                try:
                    year = int(filing.filing_date.split('-')[0])
                except (ValueError, IndexError, AttributeError):
                    print(f"Warning: Could not parse filing_date for CIK {cik}")
                    continue

                if start_year is not None and year < start_year:
                    continue
                if end_year is not None and year >= end_year:
                    continue

                filings_by_year[year].append((cik, filing))

        # Sample filings per year. Years are processed in ascending order so
        # the sequence of rng calls is deterministic for a fixed seed.
        sampled_filings: Dict[str, List["Filing"]] = defaultdict(list)
        for year in sorted(filings_by_year.keys()):
            filing_list = filings_by_year[year]
            k = min(num_samples, len(filing_list))
            for cik, filing in rng.sample(filing_list, k):
                sampled_filings[cik].append(filing)

        # Reconstruct raw data structure with only the sampled filings
        new_raw_data = {}
        for cik, filing_list in sampled_filings.items():
            company = self.companies[cik]
            new_raw_data[cik] = {
                'tickers': company.tickers,
                'cik': company.cik,
                'name': company.name,
                'sic': company.sic,
                'industry': company.industry,
                'office': company.office,
                'exchanges': company.exchanges,
                'filings': [self._filing_to_dict(f) for f in filing_list]
            }

        total_sampled = sum(len(filings) for filings in sampled_filings.values())
        print(f"Sampled {total_sampled} filings across {len(filings_by_year)} years from {len(sampled_filings)} companies.")

        return Dataset(new_raw_data)

    @staticmethod
    def _filing_to_dict(filing: "Filing") -> Dict[str, Any]:
        """Serializes a Filing back to the raw-dictionary form used on disk."""
        return {
            'form_type': filing.form_type.value,
            'acession_number': filing.acession_number,
            'filing_date': filing.filing_date,
            'primary_document': filing.primary_document,
            'filing_url': filing.filing_url,
            'local_path': filing.local_path,
            'images': [
                {
                    'img_name': img.img_name,
                    'url': img.url,
                    'local_path': img.local_path
                } for img in filing.images
            ],
            'raw_content': filing.raw_content
        }

    def get_company_by_cik(self, cik: str) -> Optional["CompanyFilings"]:
        """Retrieves the CompanyFilings object for a given CIK."""
        return self.companies.get(cik)

    def get_company_by_ticker(self, ticker: str) -> Optional["CompanyFilings"]:
        """Retrieves the CompanyFilings object for a given ticker (case-insensitive)."""
        ticker_upper = ticker.upper()
        for company in self.companies.values():
            if ticker_upper in [t.upper() for t in company.tickers]:
                return company
        return None

    def __len__(self) -> int:
        """Returns the number of companies in the dataset."""
        return len(self.companies)

    def __iter__(self):
        """Allows iteration over the CompanyFilings objects in the dataset."""
        return iter(self.companies.values())

    def __repr__(self):
        total_filings = sum(len(company.filings) for company in self.companies.values())
        return f"Dataset(total_companies={len(self.companies)}, total_filings={total_filings})"
|