ipo-mine 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. ipo_mine-0.0.0/PKG-INFO +150 -0
  2. ipo_mine-0.0.0/README.md +132 -0
  3. ipo_mine-0.0.0/pyproject.toml +36 -0
  4. ipo_mine-0.0.0/setup.cfg +4 -0
  5. ipo_mine-0.0.0/src/ipo_mine/__init__.py +3 -0
  6. ipo_mine-0.0.0/src/ipo_mine/download/__init__.py +7 -0
  7. ipo_mine-0.0.0/src/ipo_mine/download/company.py +112 -0
  8. ipo_mine-0.0.0/src/ipo_mine/download/dataset.py +193 -0
  9. ipo_mine-0.0.0/src/ipo_mine/download/downloader.py +347 -0
  10. ipo_mine-0.0.0/src/ipo_mine/entities/CompanyFilings.py +27 -0
  11. ipo_mine-0.0.0/src/ipo_mine/entities/Filing.py +17 -0
  12. ipo_mine-0.0.0/src/ipo_mine/entities/FilingImage.py +9 -0
  13. ipo_mine-0.0.0/src/ipo_mine/entities/FormType.py +8 -0
  14. ipo_mine-0.0.0/src/ipo_mine/entities/S1Filing.py +52 -0
  15. ipo_mine-0.0.0/src/ipo_mine/entities/S1FilingImage.py +11 -0
  16. ipo_mine-0.0.0/src/ipo_mine/entities/__init__.py +12 -0
  17. ipo_mine-0.0.0/src/ipo_mine/mappings/__init__.py +1 -0
  18. ipo_mine-0.0.0/src/ipo_mine/mappings/global_mapping.py +44 -0
  19. ipo_mine-0.0.0/src/ipo_mine/parse/__init__.py +12 -0
  20. ipo_mine-0.0.0/src/ipo_mine/parse/ipo_parser.py +276 -0
  21. ipo_mine-0.0.0/src/ipo_mine/parse/process_text_image.py +309 -0
  22. ipo_mine-0.0.0/src/ipo_mine/parse/section_parser.py +355 -0
  23. ipo_mine-0.0.0/src/ipo_mine/parse/table_parser.py +329 -0
  24. ipo_mine-0.0.0/src/ipo_mine/parse/toc_parser.py +1249 -0
  25. ipo_mine-0.0.0/src/ipo_mine/resources/__init__.py +31 -0
  26. ipo_mine-0.0.0/src/ipo_mine/utils/__init__.py +41 -0
  27. ipo_mine-0.0.0/src/ipo_mine/utils/config.py +191 -0
  28. ipo_mine-0.0.0/src/ipo_mine/utils/helpers.py +126 -0
  29. ipo_mine-0.0.0/src/ipo_mine.egg-info/PKG-INFO +150 -0
  30. ipo_mine-0.0.0/src/ipo_mine.egg-info/SOURCES.txt +32 -0
  31. ipo_mine-0.0.0/src/ipo_mine.egg-info/dependency_links.txt +1 -0
  32. ipo_mine-0.0.0/src/ipo_mine.egg-info/requires.txt +11 -0
  33. ipo_mine-0.0.0/src/ipo_mine.egg-info/top_level.txt +1 -0
  34. ipo_mine-0.0.0/tests/test.py +0 -0
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: ipo-mine
3
+ Version: 0.0.0
4
+ Summary: Mining and parsing S-1 IPO filings
5
+ Author: Michael Galarnyk
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: pandas>=1.5
9
+ Requires-Dist: numpy>=1.21
10
+ Requires-Dist: beautifulsoup4>=4.11
11
+ Requires-Dist: lxml>=4.9
12
+ Requires-Dist: requests>=2.28
13
+ Requires-Dist: fuzzywuzzy>=0.18.0
14
+ Requires-Dist: python-levenshtein
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest; extra == "dev"
17
+ Requires-Dist: jupyter; extra == "dev"
18
+
19
+ # IPO-Mine: S-1 (IPO) Filings Toolkit
20
+
21
+ GitHub Repository: https://github.com/gtfintechlab/S1-Filings
22
+ Project Website: https://ipo-mine.web.app/
23
+
24
+ ## Overview
25
+
26
+ IPO-Mine is a Python package for downloading, parsing, and structuring S-1 IPO filings from the U.S. Securities and Exchange Commission (SEC) EDGAR system.
27
+
28
+ This repository implements the data processing pipeline used to construct the IPO-Mine dataset, a section-structured corpus introduced in the research paper:
29
+
30
+ IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings
31
+
32
+ The objective of this project is to transform raw SEC filings into clean, standardized, and section-aligned textual representations suitable for large-scale analysis in natural language processing, information retrieval, and long-document modeling.
33
+
34
+ ## Motivation
35
+
36
+ S-1 filings are among the most complex regulatory documents used in empirical research. They exhibit several challenges:
37
+
38
+ - Extreme document length, often exceeding 100–300 pages
39
+ - Substantial variation in section headers across firms and time
40
+ - Heterogeneous formats, including HTML, plain text, and scanned images
41
+ - Limited structural consistency despite regulatory guidance
42
+
43
+ These characteristics complicate tasks such as section segmentation, cross-firm comparison, longitudinal analysis, and long-context modeling.
44
+
45
+ IPO-Mine addresses these challenges by providing a unified and reproducible pipeline that converts raw EDGAR filings into structured, research-ready data.
46
+
47
+ ## Features
48
+
49
+ - Automated downloading of S-1 and S-1/A filings from SEC EDGAR
50
+ - Parsing of Tables of Contents (TOCs) for filings dating back to 1997
51
+ - Extraction and normalization of key IPO sections, including:
52
+ - Risk Factors
53
+ - Business
54
+ - Use of Proceeds
55
+ - Management’s Discussion and Analysis (MD&A)
56
+ - Financial Statements
57
+ - Support for multiple filing formats:
58
+ - HTML
59
+ - plain text
60
+ - image-based filings via OCR
61
+ - Fuzzy matching of section headers using global section mappings
62
+ - Deterministic outputs suitable for reproducible dataset construction
63
+
64
+ ## IPO-Mine Dataset
65
+
66
+ Using this toolkit, the IPO-Mine dataset is constructed as a large-scale corpus of IPO filings with:
67
+
68
+ - Section-aligned text across firms
69
+ - Standardized section nomenclature
70
+ - Clean document boundaries
71
+ - Compatibility with long-document modeling and retrieval frameworks
72
+
73
+ Additional details and examples are available at:
74
+
75
+ https://ipo-mine.web.app/
76
+
77
+ ## Installation
78
+
79
+ The package is available on PyPI under the name `ipo-mine`.
80
+
81
+ ```
82
+ pip install ipo-mine
83
+ ```
84
+
85
+ ## OCR Dependency
86
+
87
+ Parsing image-based filings requires a local installation of Tesseract OCR.
88
+
89
+ ### Tesseract Installation
90
+
91
+ | Operating System | Installation Method |
92
+ |------------------|---------------------|
93
+ | macOS | `brew install tesseract` |
94
+ | Ubuntu / Debian | `sudo apt install tesseract-ocr` |
95
+ | Windows | UB Mannheim Tesseract installer |
96
+ | Conda environments | Included automatically |
97
+
98
+ ## Example Usage
99
+
100
+ ```python
101
+ from ipo_mine.download.company import Company
102
+ from ipo_mine.download import IPODownloader
103
+ from ipo_mine.parse.ipo_parser import S1Parser
104
+ from ipo_mine.resources import GLOBAL_SECTIONS_JSON
105
+ from ipo_mine.utils.config import PARSED_DIR
106
+
107
+ downloader = IPODownloader(
108
+ email="your_email@domain.com",
109
+ company="Your Institution"
110
+ )
111
+
112
+ ticker = "SNOW"
113
+ filing = downloader.download_s1(Company.from_ticker(ticker))
114
+
115
+ parser = S1Parser(
116
+ filing=filing,
117
+ mappings_path=GLOBAL_SECTIONS_JSON,
118
+ output_base_path=PARSED_DIR
119
+ )
120
+
121
+ risk_factors = parser.parse_section("Risk Factors", ticker)
122
+ ```
123
+
124
+ ## Research-Oriented Design
125
+
126
+ This library is designed primarily for dataset construction and reproducible empirical research rather than ad-hoc scraping.
127
+
128
+ Typical use cases include:
129
+
130
+ - Building section-aligned IPO corpora
131
+ - Comparing disclosure language across firms and time
132
+ - Training and evaluation of long-document language models
133
+ - Large-scale studies of regulatory disclosures
134
+
135
+ ## Citation
136
+
137
+ If you use this package or the IPO-Mine dataset in your research, please cite:
138
+
139
+ ```
140
+ @inproceedings{ipomine2025,
141
+ title = {IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings},
142
+ author = {Author names},
143
+ booktitle = {Proceedings of the ACM SIGKDD Conference},
144
+ year = {2025}
145
+ }
146
+ ```
147
+
148
+ ## License
149
+
150
+ This project is released under the MIT License.
@@ -0,0 +1,132 @@
1
+ # IPO-Mine: S-1 (IPO) Filings Toolkit
2
+
3
+ GitHub Repository: https://github.com/gtfintechlab/S1-Filings
4
+ Project Website: https://ipo-mine.web.app/
5
+
6
+ ## Overview
7
+
8
+ IPO-Mine is a Python package for downloading, parsing, and structuring S-1 IPO filings from the U.S. Securities and Exchange Commission (SEC) EDGAR system.
9
+
10
+ This repository implements the data processing pipeline used to construct the IPO-Mine dataset, a section-structured corpus introduced in the research paper:
11
+
12
+ IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings
13
+
14
+ The objective of this project is to transform raw SEC filings into clean, standardized, and section-aligned textual representations suitable for large-scale analysis in natural language processing, information retrieval, and long-document modeling.
15
+
16
+ ## Motivation
17
+
18
+ S-1 filings are among the most complex regulatory documents used in empirical research. They exhibit several challenges:
19
+
20
+ - Extreme document length, often exceeding 100–300 pages
21
+ - Substantial variation in section headers across firms and time
22
+ - Heterogeneous formats, including HTML, plain text, and scanned images
23
+ - Limited structural consistency despite regulatory guidance
24
+
25
+ These characteristics complicate tasks such as section segmentation, cross-firm comparison, longitudinal analysis, and long-context modeling.
26
+
27
+ IPO-Mine addresses these challenges by providing a unified and reproducible pipeline that converts raw EDGAR filings into structured, research-ready data.
28
+
29
+ ## Features
30
+
31
+ - Automated downloading of S-1 and S-1/A filings from SEC EDGAR
32
+ - Parsing of Tables of Contents (TOCs) for filings dating back to 1997
33
+ - Extraction and normalization of key IPO sections, including:
34
+ - Risk Factors
35
+ - Business
36
+ - Use of Proceeds
37
+ - Management’s Discussion and Analysis (MD&A)
38
+ - Financial Statements
39
+ - Support for multiple filing formats:
40
+ - HTML
41
+ - plain text
42
+ - image-based filings via OCR
43
+ - Fuzzy matching of section headers using global section mappings
44
+ - Deterministic outputs suitable for reproducible dataset construction
45
+
46
+ ## IPO-Mine Dataset
47
+
48
+ Using this toolkit, the IPO-Mine dataset is constructed as a large-scale corpus of IPO filings with:
49
+
50
+ - Section-aligned text across firms
51
+ - Standardized section nomenclature
52
+ - Clean document boundaries
53
+ - Compatibility with long-document modeling and retrieval frameworks
54
+
55
+ Additional details and examples are available at:
56
+
57
+ https://ipo-mine.web.app/
58
+
59
+ ## Installation
60
+
61
+ The package is available on PyPI under the name `ipo-mine`.
62
+
63
+ ```
64
+ pip install ipo-mine
65
+ ```
66
+
67
+ ## OCR Dependency
68
+
69
+ Parsing image-based filings requires a local installation of Tesseract OCR.
70
+
71
+ ### Tesseract Installation
72
+
73
+ | Operating System | Installation Method |
74
+ |------------------|---------------------|
75
+ | macOS | `brew install tesseract` |
76
+ | Ubuntu / Debian | `sudo apt install tesseract-ocr` |
77
+ | Windows | UB Mannheim Tesseract installer |
78
+ | Conda environments | Included automatically |
79
+
80
+ ## Example Usage
81
+
82
+ ```python
83
+ from ipo_mine.download.company import Company
84
+ from ipo_mine.download import IPODownloader
85
+ from ipo_mine.parse.ipo_parser import S1Parser
86
+ from ipo_mine.resources import GLOBAL_SECTIONS_JSON
87
+ from ipo_mine.utils.config import PARSED_DIR
88
+
89
+ downloader = IPODownloader(
90
+ email="your_email@domain.com",
91
+ company="Your Institution"
92
+ )
93
+
94
+ ticker = "SNOW"
95
+ filing = downloader.download_s1(Company.from_ticker(ticker))
96
+
97
+ parser = S1Parser(
98
+ filing=filing,
99
+ mappings_path=GLOBAL_SECTIONS_JSON,
100
+ output_base_path=PARSED_DIR
101
+ )
102
+
103
+ risk_factors = parser.parse_section("Risk Factors", ticker)
104
+ ```
105
+
106
+ ## Research-Oriented Design
107
+
108
+ This library is designed primarily for dataset construction and reproducible empirical research rather than ad-hoc scraping.
109
+
110
+ Typical use cases include:
111
+
112
+ - Building section-aligned IPO corpora
113
+ - Comparing disclosure language across firms and time
114
+ - Training and evaluation of long-document language models
115
+ - Large-scale studies of regulatory disclosures
116
+
117
+ ## Citation
118
+
119
+ If you use this package or the IPO-Mine dataset in your research, please cite:
120
+
121
+ ```
122
+ @inproceedings{ipomine2025,
123
+ title = {IPO-Mine: A Section-Structured Dataset for Analyzing Long and Complex IPO Filings},
124
+ author = {Author names},
125
+ booktitle = {Proceedings of the ACM SIGKDD Conference},
126
+ year = {2025}
127
+ }
128
+ ```
129
+
130
+ ## License
131
+
132
+ This project is released under the MIT License.
@@ -0,0 +1,36 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ipo-mine"
7
+ version = "0.0.0"
8
+ description = "Mining and parsing S-1 IPO filings"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+
12
+ authors = [
13
+ { name = "Michael Galarnyk" }
14
+ ]
15
+
16
+ dependencies = [
17
+ "pandas>=1.5",
18
+ "numpy>=1.21",
19
+ "beautifulsoup4>=4.11",
20
+ "lxml>=4.9",
21
+ "requests>=2.28",
22
+ "fuzzywuzzy>=0.18.0",
23
+ "python-levenshtein",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ dev = [
28
+ "pytest",
29
+ "jupyter",
30
+ ]
31
+
32
+ [tool.setuptools]
33
+ package-dir = {"" = "src"}
34
+
35
+ [tool.setuptools.packages.find]
36
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """IPO Mining toolkit for downloading and parsing S-1 SEC filings."""
2
+
3
+ __version__ = "0.0.0"
@@ -0,0 +1,7 @@
1
"""Public API of the ``download`` subpackage.

Re-exports the EDGAR downloader, the ticker/CIK ``Company`` lookup helper,
and the ``Dataset`` container for downloaded filings.
"""

from .downloader import IPODownloader
from .company import Company
from .dataset import Dataset



__all__ = ["IPODownloader", "Company", "Dataset"]
@@ -0,0 +1,112 @@
1
+ import pandas as pd
2
+ import requests
3
+ from dataclasses import dataclass
4
+ from typing import Optional, Dict, Tuple
5
+
6
+ from resources import get_resource_path
7
+
8
@dataclass
class Company:
    """A public company identified by its ticker symbol and SEC CIK.

    Instances are usually created through the :meth:`from_ticker` /
    :meth:`from_cik` alternate constructors, which consult a cached
    ticker<->CIK mapping derived from the SEC's company_tickers.json.
    """

    ticker: str   # stock ticker symbol (upper-cased by from_ticker)
    cik: str      # SEC Central Index Key, zero-padded to 10 digits
    active: bool  # True when the entry comes from the current SEC mapping

    # Class-level lookup caches shared by all instances, populated once by
    # _load_mapping(). Deliberately left WITHOUT annotations: annotated
    # class attributes of a @dataclass become per-instance __init__ fields,
    # which is not intended for these caches.
    # Shape: dict mapping key -> (mapped value, active flag).
    _ticker_to_cik = None
    _cik_to_ticker = None

    @classmethod
    def _load_mapping(cls):
        """
        Loads and caches the ticker-CIK mapping from file(s) into dictionaries.
        This method is designed to run only once.
        """
        if cls._ticker_to_cik is not None and cls._cik_to_ticker is not None:
            return

        def read_sec_mapping():
            # Entries from the SEC file are marked as active.
            frame = pd.read_csv(get_resource_path("company_tickers_to_cik.csv"), dtype=str)
            frame['active'] = True
            return frame

        try:
            df = read_sec_mapping()
        except FileNotFoundError:
            # First run: fetch the mapping from the SEC, then retry the read.
            cls._download_ticker_cik_mapping()
            df = read_sec_mapping()

        try:
            upgraded_df = pd.read_csv("upgraded_mapping.csv", dtype=str)
            upgraded_df['active'] = False  # entries from the custom file are inactive
            # Combine the two DataFrames; on duplicate tickers the SEC row
            # (concatenated last, keep='last') wins.
            df = pd.concat([upgraded_df, df]).drop_duplicates(subset=['ticker'], keep='last')
        except Exception:
            # Best effort: the upgraded mapping is optional; fall back to the
            # standard SEC mapping when it is missing or unreadable.
            pass

        # Fast dictionary lookups storing a tuple of (value, active_status).
        cls._ticker_to_cik = {
            str(row.ticker): (str(row.cik_str), bool(row.active))
            for row in df.itertuples(index=False)
        }

        cls._cik_to_ticker = {
            str(row.cik_str): (str(row.ticker), bool(row.active))
            for row in df.itertuples(index=False)
        }

    @classmethod
    def from_ticker(cls, ticker: str) -> 'Company':
        """Create Company from ticker, looking up the CIK and active status.

        Raises:
            ValueError: when the ticker is not present in the mapping.
        """
        lookup_result = cls._lookup_cik_from_ticker(ticker.upper())
        if not lookup_result:
            raise ValueError(f"No CIK found for ticker: {ticker}")

        cik, active = lookup_result
        return cls(ticker=ticker.upper(), cik=cik, active=active)

    @classmethod
    def from_cik(cls, cik: str) -> 'Company':
        """Create Company from CIK, looking up the ticker and active status.

        An unknown CIK yields an inactive Company with an empty ticker
        rather than raising.
        """
        formatted_cik = cik.zfill(10)
        lookup_result = cls._lookup_ticker_from_cik(formatted_cik)
        if not lookup_result:
            return cls(ticker="", cik=formatted_cik, active=False)

        ticker, active = lookup_result
        return cls(ticker=ticker, cik=formatted_cik, active=active)

    @classmethod
    def _lookup_cik_from_ticker(cls, ticker: str) -> Optional[Tuple[str, bool]]:
        """Return (cik, active) for *ticker*, or None when unknown."""
        cls._load_mapping()  # Ensures mapping is loaded before lookup
        return cls._ticker_to_cik.get(ticker)

    @classmethod
    def _lookup_ticker_from_cik(cls, cik: str) -> Optional[Tuple[str, bool]]:
        """Return (ticker, active) for *cik*, or None when unknown."""
        cls._load_mapping()  # Ensures mapping is loaded before lookup
        return cls._cik_to_ticker.get(cik)

    @staticmethod
    def _download_ticker_cik_mapping():
        """Download the ticker-CIK mapping from SEC and cache it as CSV.

        Raises:
            RuntimeError: when the download or the CSV write fails.
        """
        headers = {"User-Agent": "Company Lookup Tool contact@example.com"}
        try:
            response = requests.get(
                "https://www.sec.gov/files/company_tickers.json",
                headers=headers
            )
            response.raise_for_status()
            df = pd.DataFrame.from_dict(response.json(), orient="index")
            df["cik_str"] = df["cik_str"].astype(str).str.zfill(10)

            save_path = get_resource_path("company_tickers_to_cik.csv")
            df.to_csv(save_path, index=False)
            print(f"✅ Successfully downloaded ticker-CIK mapping → {save_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to download ticker-CIK mapping: {e}") from e
@@ -0,0 +1,193 @@
1
+ import json
2
+ from typing import Dict, Any, List, Union, Optional, Tuple
3
+ from collections import defaultdict
4
+ from entities import S1Filing, S1FilingImage, CompanyFilings, Filing, FilingImage, FormType
5
+ import random
6
+
7
class Dataset:
    """
    Manages the collection of CompanyFilings objects, handling JSON loading and sampling.
    """

    def __init__(self, raw_data_or_path: Union[str, Dict[str, Dict[str, Any]]]):
        """
        Initializes the dataset by either loading from a file path or processing
        an existing dictionary of raw data.

        :param raw_data_or_path: Path to a JSON file, or an already-loaded
            mapping of CIK -> raw company data.
        """
        if isinstance(raw_data_or_path, str):
            raw_data = self._load_json(raw_data_or_path)
        else:
            raw_data = raw_data_or_path

        # Mapping of CIK -> CompanyFilings.
        self.companies = self._parse_data(raw_data)

    def _load_json(self, file_path: str) -> Dict[str, Dict[str, Any]]:
        """Handles opening and parsing the JSON file."""
        # JSON is UTF-8 by specification; pin the encoding instead of relying
        # on the platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _parse_data(self, raw_data: Dict[str, Dict[str, Any]]) -> Dict[str, "CompanyFilings"]:
        """Iterates through the raw data dictionary and creates CompanyFilings objects."""
        parsed_companies = {}

        for cik, company_data in raw_data.items():
            # Build Filing objects, skipping entries with unknown form types.
            filings_list = []
            for filing_data in company_data.get("filings", []):
                filing = self._build_filing(cik, filing_data)
                if filing is not None:
                    filings_list.append(filing)

            parsed_companies[cik] = CompanyFilings(
                tickers=company_data.get('tickers', []),
                cik=company_data.get('cik', cik),
                name=company_data.get('name', ''),
                sic=company_data.get('sic'),
                industry=company_data.get('industry'),
                office=company_data.get('office'),
                exchanges=company_data.get('exchanges'),
                filings=filings_list
            )

        return parsed_companies

    @staticmethod
    def _build_filing(cik: str, filing_data: Dict[str, Any]) -> Optional["Filing"]:
        """Builds one Filing (with its images) from raw dict data.

        Returns None (after printing a warning) when the form type is not a
        recognized FormType value.
        """
        images_list = [
            FilingImage(
                img_name=img.get('img_name'),
                url=img.get('url'),
                local_path=img.get('local_path')
            ) for img in filing_data.get("images", [])
        ]

        form_type_str = filing_data.get('form_type', '')
        try:
            form_type = FormType(form_type_str)
        except ValueError:
            print(f"Warning: Unknown form type '{form_type_str}' for CIK {cik}")
            return None

        # NOTE: 'acession_number' (sic) is the key used throughout the raw
        # data and the Filing entity; the spelling is kept for compatibility.
        return Filing(
            form_type=form_type,
            acession_number=filing_data.get('acession_number', ''),
            filing_date=filing_data.get('filing_date', ''),
            primary_document=filing_data.get('primary_document', ''),
            filing_url=filing_data.get('filing_url', ''),
            local_path=filing_data.get('local_path'),
            images=images_list,
            raw_content=filing_data.get('raw_content')
        )

    @staticmethod
    def _filing_to_dict(filing: "Filing") -> Dict[str, Any]:
        """Serializes a Filing (and its images) back into the raw dict format."""
        return {
            'form_type': filing.form_type.value,
            'acession_number': filing.acession_number,
            'filing_date': filing.filing_date,
            'primary_document': filing.primary_document,
            'filing_url': filing.filing_url,
            'local_path': filing.local_path,
            'images': [
                {
                    'img_name': img.img_name,
                    'url': img.url,
                    'local_path': img.local_path
                } for img in filing.images
            ],
            'raw_content': filing.raw_content
        }

    def sample_filings_by_year(self, num_samples: int, start_year: Optional[int] = None, end_year: Optional[int] = None) -> 'Dataset':
        """
        Samples up to 'num_samples' filings per year and returns a new
        Dataset instance containing only the sampled filings.

        :param num_samples: The maximum number of filings to sample per year.
        :param start_year: The start year to sample (inclusive).
        :param end_year: The end year to sample (exclusive).
        :return: A new Dataset instance with the sampled data.
        """
        # Collect all filings across all companies, grouped by calendar year.
        filings_by_year: Dict[int, List[Tuple[str, "Filing"]]] = defaultdict(list)

        for cik, company in self.companies.items():
            for filing in company.filings:
                # filing_date is expected as "YYYY-MM-DD"; skip rows we cannot
                # parse rather than aborting the whole sampling run.
                try:
                    year = int(filing.filing_date.split('-')[0])
                except (ValueError, IndexError, AttributeError):
                    print(f"Warning: Could not parse filing_date for CIK {cik}")
                    continue

                if start_year is not None and year < start_year:
                    continue
                if end_year is not None and year >= end_year:
                    continue

                filings_by_year[year].append((cik, filing))

        # Draw up to num_samples filings from each year, re-grouping by CIK.
        sampled_filings: Dict[str, List["Filing"]] = defaultdict(list)
        for year in sorted(filings_by_year):
            candidates = filings_by_year[year]
            k = min(num_samples, len(candidates))
            for cik, filing in random.sample(candidates, k):
                sampled_filings[cik].append(filing)

        # Rebuild the raw-data structure containing only the sampled filings.
        new_raw_data = {}
        for cik, filing_list in sampled_filings.items():
            company = self.companies[cik]
            new_raw_data[cik] = {
                'tickers': company.tickers,
                'cik': company.cik,
                'name': company.name,
                'sic': company.sic,
                'industry': company.industry,
                'office': company.office,
                'exchanges': company.exchanges,
                'filings': [self._filing_to_dict(f) for f in filing_list]
            }

        total_sampled = sum(len(filings) for filings in sampled_filings.values())
        print(f"Sampled {total_sampled} filings across {len(filings_by_year)} years from {len(sampled_filings)} companies.")

        return Dataset(new_raw_data)

    def get_company_by_cik(self, cik: str) -> Optional["CompanyFilings"]:
        """Retrieves the CompanyFilings object for a given CIK."""
        return self.companies.get(cik)

    def get_company_by_ticker(self, ticker: str) -> Optional["CompanyFilings"]:
        """Retrieves the CompanyFilings object for a given ticker (case-insensitive)."""
        ticker_upper = ticker.upper()
        for company in self.companies.values():
            if ticker_upper in [t.upper() for t in company.tickers]:
                return company
        return None

    def __len__(self) -> int:
        """Returns the number of companies in the dataset."""
        return len(self.companies)

    def __iter__(self):
        """Allows iteration over the CompanyFilings objects in the dataset."""
        return iter(self.companies.values())

    def __repr__(self):
        total_filings = sum(len(company.filings) for company in self.companies.values())
        return f"Dataset(total_companies={len(self.companies)}, total_filings={total_filings})"
+ return f"Dataset(total_companies={len(self.companies)}, total_filings={total_filings})"