datadock 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. datadock-1.0.2/LICENSE +21 -0
  2. datadock-1.0.2/PKG-INFO +179 -0
  3. datadock-1.0.2/README.md +139 -0
  4. datadock-1.0.2/datadock/__init__.py +19 -0
  5. datadock-1.0.2/datadock/config.py +4 -0
  6. datadock-1.0.2/datadock/controllers/__init__.py +4 -0
  7. datadock-1.0.2/datadock/controllers/_base_controllers.py +120 -0
  8. datadock-1.0.2/datadock/controllers/_controller_10k.py +157 -0
  9. datadock-1.0.2/datadock/controllers/_controller_8k.py +204 -0
  10. datadock-1.0.2/datadock/controllers/_factory.py +39 -0
  11. datadock-1.0.2/datadock/controllers/control.py +76 -0
  12. datadock-1.0.2/datadock/core.py +88 -0
  13. datadock-1.0.2/datadock/ddhtml/__init__.py +5 -0
  14. datadock-1.0.2/datadock/ddhtml/doc_headers.py +128 -0
  15. datadock-1.0.2/datadock/ddhtml/extract_data.py +222 -0
  16. datadock-1.0.2/datadock/ddhtml/extract_tables.py +41 -0
  17. datadock-1.0.2/datadock/ddhtml/scrape_r_html.py +208 -0
  18. datadock-1.0.2/datadock/filings.py +22 -0
  19. datadock-1.0.2/datadock/financials/__init__.py +5 -0
  20. datadock-1.0.2/datadock/financials/base_table.py +176 -0
  21. datadock-1.0.2/datadock/financials/doc_links.py +219 -0
  22. datadock-1.0.2/datadock/metadata/__init__.py +0 -0
  23. datadock-1.0.2/datadock/metadata/__version__.py +4 -0
  24. datadock-1.0.2/datadock/src/__init__.py +14 -0
  25. datadock-1.0.2/datadock/src/_rich_.py +220 -0
  26. datadock-1.0.2/datadock/src/_tabulate_.py +104 -0
  27. datadock-1.0.2/datadock/src/_xml.py +138 -0
  28. datadock-1.0.2/datadock/src/api_base.py +214 -0
  29. datadock-1.0.2/datadock/src/api_errors.py +148 -0
  30. datadock-1.0.2/datadock/src/api_response.py +16 -0
  31. datadock-1.0.2/datadock/src/blob_storage.py +102 -0
  32. datadock-1.0.2/datadock/src/constants.py +55 -0
  33. datadock-1.0.2/datadock/src/custom_logger.py +85 -0
  34. datadock-1.0.2/datadock/src/dataclasses.py +22 -0
  35. datadock-1.0.2/datadock/src/document.py +55 -0
  36. datadock-1.0.2/datadock/src/filings.py +287 -0
  37. datadock-1.0.2/datadock/src/filters.py +126 -0
  38. datadock-1.0.2/datadock/src/models.py +322 -0
  39. datadock-1.0.2/datadock/src/utils.py +179 -0
  40. datadock-1.0.2/datadock/utils/__init__.py +0 -0
  41. datadock-1.0.2/datadock/utils/_compact.py +11 -0
  42. datadock-1.0.2/pyproject.toml +88 -0
datadock-1.0.2/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 PETER MBACHU
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.1
2
+ Name: datadock
3
+ Version: 1.0.2
4
+ Summary: DataDock Python library to access SEC EDGAR filings.
5
+ License: MIT
6
+ Keywords: datadockpy,datadockAI,python,SEC
7
+ Author: Peter Mbachu
8
+ Author-email: peter.mbachu@datadock.ai.com
9
+ Requires-Python: >=3.10,<4.0
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Natural Language :: English
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Dist: azure-core (>=1.31.0,<2.0.0)
22
+ Requires-Dist: azure-storage-blob (>=12.23.1,<13.0.0)
23
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
+ Requires-Dist: colorama (>=0.4.6,<0.5.0)
25
+ Requires-Dist: fastcore (>=1.7.17,<2.0.0)
26
+ Requires-Dist: humanize (>=4.11.0,<5.0.0)
27
+ Requires-Dist: lxml (>=5.3.0,<6.0.0)
28
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
29
+ Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
30
+ Requires-Dist: python-decouple (>=3.8,<4.0)
31
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
32
+ Requires-Dist: rich (>=13.8.1,<14.0.0)
33
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
+ Project-URL: Bug Tracker, https://github.com/DataDock-AI/DataDockPy/issues
35
+ Project-URL: Documentation, https://paystackease.readthedocs.io/en/latest/
36
+ Project-URL: Homepage, https://github.com/DataDock-AI/DataDockPy
37
+ Project-URL: Source Code, https://github.com/DataDock-AI/DataDockPy
38
+ Description-Content-Type: text/markdown
39
+
40
+ # DataDockPy Library
41
+
42
+ --------------
43
+
44
+ ![Python Versions](https://img.shields.io/badge/python-3.10|3.11|3.12|3.13-blue)
45
+ ![License](https://img.shields.io/pypi/l/datadock.svg)
46
+ ![PyPi - Version](https://img.shields.io/pypi/v/datadock.svg)
47
+
48
+ <p align="center">
49
+ <a href="https://github.com/DataDock-AI/DataDockPy">
50
+ <img src="media/20241001135438-ezgif.com-video-to-gif-converter.gif" alt="DataDock demo" height="350">
51
+ </a>
52
+ </p>
53
+
54
+
55
+ ## Introduction
56
+
57
+ DataDockPy is a powerful and user-friendly Python package designed to enhance data analysis.
58
+ It simplifies the extraction and presentation of information from the SEC Edgar database,
59
+ offering enriched data display for SEC form types, form controls, and financial analysis,
60
+ making it an essential tool for professionals working with regulatory filings.
61
+
62
+ This library uses `Poetry` as a dependency manager.
63
+
64
+
65
+
66
+ ## Features
67
+
68
+ - 📁 **Access any SEC filing**: You can access any filings on SEC forms of Form 8-K and Form 10-K
69
+ - 📅 **List filings for any date range**: List filings by date or by date range, e.g. `2024-02-29:2024-03-15`
70
+ - 🌟 **User Friendly library**: Uses **[rich](https://rich.readthedocs.io/en/stable/introduction.html) & [tabulate](https://github.com/astanin/python-tabulate)** library to display SEC Edgar data in a beautiful way.
71
+ - 🔄 **Page through filings**: Use `filings.next()` and `filings.previous()` to page through filings
72
+ - 🏗️ **Filter filings data**: Build data filtering by cik, accession number, form type, filing date
73
+ - ✅ **Select a filing**: You can select a filing from the list of filings.
74
+ - 🔍 **Preview the text data for a filing**: You can preview the filing (sections) in the terminal or a notebook.
75
+ - 📊 **Parse to Dataframe**: You can parse filings to a dataframe.
76
+ - 📈 **Financial Statement**: Get financial statements of Form 8-K and Form 10-K of various companies.
77
+
78
+
79
+ ## Get Started on Windows/MacOS/Linux Terminal
80
+
81
+ 1. Open your terminal and install poetry using `pip`.
82
+
83
+ ```commandline
84
+ pip install poetry
85
+ ```
86
+ or
87
+
88
+ Install poetry using `pipx`.
89
+
90
+ ```commandline
91
+ pipx install poetry
92
+ ```
93
+
94
+ 2. Create a project and clone DataDockPy github repository.
95
+
96
+ ```git
97
+ git clone https://github.com/DataDock-AI/DataDockPy.git
98
+ ```
99
+
100
+ 3. Change directory to `DataDockPy`
101
+
102
+ ```commandline
103
+ cd DataDockPy
104
+ ```
105
+
106
+ 4. Run the poetry command to install dependencies:
107
+
108
+ ```commandline
109
+ poetry install
110
+ ```
111
+
112
+ 5. Activate virtual environment using poetry:
113
+
114
+ ```commandline
115
+ poetry shell
116
+ ```
117
+
118
+ 6. Set up your SEC_IDENTITY in an `.env` file
119
+
120
+ ```dotenv
121
+ SEC_IDENTITY=<your email or username for SEC IDENTITY>
122
+ ```
123
+
124
+ 7. See the following scripts on how to use the package: `run_checks.py` and `run_checks2.py`
125
+
126
+
127
+ ## Use DataDockPy with Jupyter
128
+
129
+ 1. Open terminal and install poetry using `pip`.
130
+
131
+ ```commandline
132
+ pip install poetry
133
+ ```
134
+ or
135
+
136
+ Install poetry using `pipx`.
137
+
138
+ ```commandline
139
+ pipx install poetry
140
+ ```
141
+
142
+ 2. Create a project and clone `DataDockPy` github repository
143
+
144
+ ```git
145
+ git clone https://github.com/DataDock-AI/DataDockPy.git
146
+ ```
147
+
148
+ 3. Change directory to `DataDockPy`
149
+
150
+ ```commandline
151
+ cd DataDockPy
152
+ ```
153
+
154
+ 4. Open your project directory on Anaconda or Visual Studio Code.
155
+
156
+ 5. Choose a python environment (recommended python environment), where poetry was installed. `Ctrl+Shift+P`
157
+ 6. If asked to install `ipykernel`, see here on installation: [ipykernel installation](https://devinschumacher.com/how-to-setup-jupyter-notebook-virtual-environment-vs-code-kernels/)
158
+ 7. Check to see if poetry is installed:
159
+
160
+ ```bash
161
+ !poetry --version
162
+ ```
163
+
164
+ 8. Run the poetry command to install dependencies:
165
+
166
+ ```commandline
167
+ poetry install
168
+ ```
169
+
170
+
171
+ ## Download DataDockPy Source Code
172
+
173
+ You can download any of the source codes: `zip` or `tar.gz` here: [DataDockPy Source Code](https://github.com/DataDock-AI/DataDockPy/releases/tag/v0.1.0).
174
+
175
+
176
+ If you run into a problem or would like to contribute, please open an issue at https://github.com/DataDock-AI/DataDockPy/issues
177
+
178
+
179
+
@@ -0,0 +1,139 @@
1
+ # DataDockPy Library
2
+
3
+ --------------
4
+
5
+ ![Python Versions](https://img.shields.io/badge/python-3.10|3.11|3.12|3.13-blue)
6
+ ![License](https://img.shields.io/pypi/l/datadock.svg)
7
+ ![PyPi - Version](https://img.shields.io/pypi/v/datadock.svg)
8
+
9
+ <p align="center">
10
+ <a href="https://github.com/DataDock-AI/DataDockPy">
11
+ <img src="media/20241001135438-ezgif.com-video-to-gif-converter.gif" alt="DataDock demo" height="350">
12
+ </a>
13
+ </p>
14
+
15
+
16
+ ## Introduction
17
+
18
+ DataDockPy is a powerful and user-friendly Python package designed to enhance data analysis.
19
+ It simplifies the extraction and presentation of information from the SEC Edgar database,
20
+ offering enriched data display for SEC form types, form controls, and financial analysis,
21
+ making it an essential tool for professionals working with regulatory filings.
22
+
23
+ This library uses `Poetry` as a dependency manager.
24
+
25
+
26
+
27
+ ## Features
28
+
29
+ - 📁 **Access any SEC filing**: You can access any filings on SEC forms of Form 8-K and Form 10-K
30
+ - 📅 **List filings for any date range**: List filings by date or by date range, e.g. `2024-02-29:2024-03-15`
31
+ - 🌟 **User Friendly library**: Uses **[rich](https://rich.readthedocs.io/en/stable/introduction.html) & [tabulate](https://github.com/astanin/python-tabulate)** library to display SEC Edgar data in a beautiful way.
32
+ - 🔄 **Page through filings**: Use `filings.next()` and `filings.previous()` to page through filings
33
+ - 🏗️ **Filter filings data**: Build data filtering by cik, accession number, form type, filing date
34
+ - ✅ **Select a filing**: You can select a filing from the list of filings.
35
+ - 🔍 **Preview the text data for a filing**: You can preview the filing (sections) in the terminal or a notebook.
36
+ - 📊 **Parse to Dataframe**: You can parse filings to a dataframe.
37
+ - 📈 **Financial Statement**: Get financial statements of Form 8-K and Form 10-K of various companies.
38
+
39
+
40
+ ## Get Started on Windows/MacOS/Linux Terminal
41
+
42
+ 1. Open your terminal and install poetry using `pip`.
43
+
44
+ ```commandline
45
+ pip install poetry
46
+ ```
47
+ or
48
+
49
+ Install poetry using `pipx`.
50
+
51
+ ```commandline
52
+ pipx install poetry
53
+ ```
54
+
55
+ 2. Create a project and clone DataDockPy github repository.
56
+
57
+ ```git
58
+ git clone https://github.com/DataDock-AI/DataDockPy.git
59
+ ```
60
+
61
+ 3. Change directory to `DataDockPy`
62
+
63
+ ```commandline
64
+ cd DataDockPy
65
+ ```
66
+
67
+ 4. Run the poetry command to install dependencies:
68
+
69
+ ```commandline
70
+ poetry install
71
+ ```
72
+
73
+ 5. Activate virtual environment using poetry:
74
+
75
+ ```commandline
76
+ poetry shell
77
+ ```
78
+
79
+ 6. Set up your SEC_IDENTITY in an `.env` file
80
+
81
+ ```dotenv
82
+ SEC_IDENTITY=<your email or username for SEC IDENTITY>
83
+ ```
84
+
85
+ 7. See the following scripts on how to use the package: `run_checks.py` and `run_checks2.py`
86
+
87
+
88
+ ## Use DataDockPy with Jupyter
89
+
90
+ 1. Open terminal and install poetry using `pip`.
91
+
92
+ ```commandline
93
+ pip install poetry
94
+ ```
95
+ or
96
+
97
+ Install poetry using `pipx`.
98
+
99
+ ```commandline
100
+ pipx install poetry
101
+ ```
102
+
103
+ 2. Create a project and clone `DataDockPy` github repository
104
+
105
+ ```git
106
+ git clone https://github.com/DataDock-AI/DataDockPy.git
107
+ ```
108
+
109
+ 3. Change directory to `DataDockPy`
110
+
111
+ ```commandline
112
+ cd DataDockPy
113
+ ```
114
+
115
+ 4. Open your project directory on Anaconda or Visual Studio Code.
116
+
117
+ 5. Choose a python environment (recommended python environment), where poetry was installed. `Ctrl+Shift+P`
118
+ 6. If asked to install `ipykernel`, see here on installation: [ipykernel installation](https://devinschumacher.com/how-to-setup-jupyter-notebook-virtual-environment-vs-code-kernels/)
119
+ 7. Check to see if poetry is installed:
120
+
121
+ ```bash
122
+ !poetry --version
123
+ ```
124
+
125
+ 8. Run the poetry command to install dependencies:
126
+
127
+ ```commandline
128
+ poetry install
129
+ ```
130
+
131
+
132
+ ## Download DataDockPy Source Code
133
+
134
+ You can download any of the source codes: `zip` or `tar.gz` here: [DataDockPy Source Code](https://github.com/DataDock-AI/DataDockPy/releases/tag/v0.1.0).
135
+
136
+
137
+ If you run into a problem or would like to contribute, please open an issue at https://github.com/DataDock-AI/DataDockPy/issues
138
+
139
+
@@ -0,0 +1,19 @@
1
+ from datadock.src import (
2
+ DataDockError,
3
+ CustomLogger,
4
+ SECRequestHandler,
5
+ # CurrentFile,
6
+ display_table_with_rich,
7
+ # TextSummaryModel,
8
+ # SentimentAnalysisModel,
9
+ # EntityRecognitionModel,
10
+ )
11
+ from datadock.core import DataPager, table_array
12
+ from datadock.filings import CurrentFilings
13
+ from datadock.controllers import FormControl
14
+ from datadock.financials import parse_html_new, DocumentFinancial, DataDocFiling
15
+ from datadock.ddhtml import CompanyFilingInfo
16
+ from datadock.metadata.__version__ import __version__
17
+
18
+
19
+ VERSION = __version__
@@ -0,0 +1,4 @@
1
+ from decouple import config
2
+
3
+
4
+ sec_identity: str = config("SEC_IDENTITY")
@@ -0,0 +1,4 @@
1
+ from datadock.controllers.control import FormControl
2
+
3
+
4
+ __all__ = ["FormControl"]
@@ -0,0 +1,120 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional, List, Union
3
+ import pyarrow as pa
4
+ import pandas as pd
5
+ from rich.console import Group
6
+ from rich.panel import Panel
7
+ from rich.text import Text
8
+
9
+ from datadock.src import DocumentProcessor
10
+
11
+ from datadock.src.custom_logger import CustomLogger
12
+ from datadock.core import DataPager
13
+ from datadock.src._rich_ import repr_rich
14
+ from datadock.src.constants import IntString
15
+ from datadock.src.filters import filter_by_section_titles
16
+ from datadock.src.dataclasses import FilingsState
17
+
18
+
19
class FormBaseController(ABC):
    """Abstract base for controllers that turn one filing into a table of sections.

    The subclass-provided ``_process()`` runs once during construction. Its
    result is kept as ``data_table`` and, when it is not ``None``, wrapped in a
    ``FormDataSections`` exposed as ``data``.
    """

    def __init__(
        self,
        cik: str,
        accession_number: str,
        form_type: str,
        logger: CustomLogger,
        document_processor: DocumentProcessor,
    ):
        """Store the filing identifiers and collaborators, then process the filing.

        Args:
            cik: Stored unchanged on ``self.cik``.
            accession_number: Stored unchanged on ``self.accession_number``.
            form_type: Stored unchanged on ``self.form_type``.
            logger: Stored on ``self._logger`` for use by subclasses.
            document_processor: Stored on ``self._document_processor``; it is
                available to ``_process()`` because it is assigned before
                ``_process()`` is called.
        """
        self.cik = cik
        self.accession_number = accession_number
        self.form_type = form_type
        self._logger = logger
        self._document_processor = document_processor
        self.data_table: Optional[pa.Table] = self._process()
        # ``_process()`` is allowed to return None. FormDataSections calls
        # ``len()`` on its table, so None must not be wrapped.
        self.data: Optional[FormDataSections] = (
            FormDataSections(self.data_table)
            if self.data_table is not None
            else None
        )

    @abstractmethod
    def _process(self) -> Optional[pa.Table]:
        """Return the table of extracted sections, or ``None`` if there are none."""
        pass

    def to_pandas(self, *columns) -> Optional[pd.DataFrame]:
        """Convert the section table to a pandas DataFrame.

        Args:
            *columns: Optional column labels to keep. When none are given the
                whole frame is returned.

        Returns:
            The (optionally column-filtered) DataFrame, or ``None`` when no
            section table is available.
        """
        # An explicit None test is required: FormDataSections defines neither
        # __bool__ nor __len__, so a plain truthiness check can never be False.
        if self.data is None or self.data.data is None:
            return None
        data_frame = self.data.data.to_pandas()
        return data_frame.filter(items=list(columns)) if columns else data_frame

    # TODO: use rich or tabulate to display the pyarrow table result to users.

    def filter(
        self,
        titles: Optional[Union[str, List[IntString]]] = None,
    ) -> Optional["FormDataSections"]:
        """Return the sections restricted to the given titles.

        Args:
            titles: A single title or a list of titles. List entries are
                converted to ``str`` before filtering. A falsy value (``None``,
                ``""`` or ``[]``) disables filtering.

        Returns:
            A new ``FormDataSections`` over the table produced by
            ``filter_by_section_titles`` (or over the unfiltered table when
            ``titles`` is falsy), or ``None`` when no section table is
            available.
        """
        if self.data is None:
            return None

        filing_index = self.data.data
        section_titles = titles

        if isinstance(section_titles, list):
            section_titles = [str(title) for title in section_titles]

        if section_titles:
            filing_index = filter_by_section_titles(filing_index, titles=section_titles)
        return FormDataSections(filing_index)

    @property
    def get_all_sections(self):
        """The ``FormDataSections`` built at construction (``None`` if there was no table)."""
        return self.data
68
+
69
+
70
class FormDataSections:
    """Wrapper around a table of filing sections that can be rendered with rich.

    The wrapped table is read through its ``"Section Title"`` and
    ``"Section Content"`` columns when rendering.
    """

    def __init__(
        self,
        filing_index: pa.Table,
        original_state: Optional[FilingsState] = None,
        logger: Optional[CustomLogger] = None,
    ) -> None:
        """Keep the table and set up paging state and a logger.

        Args:
            filing_index: The table of sections; stored on ``self.data``.
            original_state: Paging state to reuse. When falsy, a
                ``FilingsState(0, len(filing_index))`` is created.
            logger: Logger to use. When falsy, ``CustomLogger().logger`` is used.
        """
        self.data = filing_index
        if not original_state:
            original_state = FilingsState(0, len(filing_index))
        self.original_state = original_state
        self.data_pager = DataPager(self.data)
        self._logger: CustomLogger = logger if logger else CustomLogger().logger

    def _page_index(self) -> range:
        """Return the index range for the current page of the data."""
        state = self.original_state
        if not state:
            return range(*self.data_pager.current_range)

        # The range starts at the stored page start and spans at most one page.
        rows_on_page = min(self.data_pager.page_size, len(self.data))
        return range(state.page_start, state.page_start + rows_on_page)

    def __rich__(self) -> Panel:
        """Render every section as a titled sub-panel inside one green-bordered panel."""

        def _section_panel(title, body) -> Panel:
            # The title Text is built before the body Text.
            heading = Text(title, style="bold blue")
            return Panel(Text(body), title=heading)

        # Column access goes through pandas.
        frame = self.data.to_pandas()
        sub_panels = [
            _section_panel(title, body)
            for title, body in zip(frame["Section Title"], frame["Section Content"])
        ]

        return Panel(
            Group(*sub_panels),
            title="DataDock Filings Sections and Contents",
            border_style="green",
        )

    def __repr__(self):
        """Return the rich rendering converted to text by ``repr_rich``."""
        return repr_rich(self.__rich__())
@@ -0,0 +1,157 @@
1
+ import re
2
+ import pyarrow as pa
3
+ from typing import Optional, Dict
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+ from datadock.controllers._base_controllers import FormBaseController
8
+ from datadock.src.utils import clean
9
+ from datadock.core import table_array
10
+
11
+
12
class Clean10KController(FormBaseController):
    """Extract the standard Form 10-K items from a filing's HTML.

    Sections are located through table-of-contents links: every ``<td>`` that
    contains an ``<a href>`` whose text includes one of the known item titles
    is followed to the element with the matching ``id``, and the text from
    there onward is collected.
    """

    # (item id, item title) pairs in Form 10-K order. Only the title is used
    # for matching; matching is case-insensitive.
    _SECTION_NAMES = [
        ("item1", "Business"),
        ("item1a", "Risk Factors"),
        ("item1b", "Unresolved Staff Comments"),
        ("item1c", "Cybersecurity"),
        ("item2", "Properties"),
        ("item3", "Legal Proceedings"),
        ("item4", "Mine Safety Disclosures"),
        (
            "item5",
            "Market for Registrant’s Common Equity, Related Stockholder Matters, and Issuer Purchases of Equity Securities",
        ),
        ("item6", "Selected Financial Data"),
        (
            "item7",
            "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
        ),
        ("item7a", "Quantitative and Qualitative Disclosures about Market Risk"),
        ("item8", "Financial Statements and Supplementary Data"),
        (
            "item9",
            "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
        ),
        ("item9a", "Controls and Procedures"),
        ("item9b", "Other Information"),
        (
            "item9c",
            "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
        ),
        ("item10", "Directors, Executive Officers, and Corporate Governance"),
        ("item11", "Executive Compensation"),
        (
            "item12",
            "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
        ),
        (
            "item13",
            "Certain Relationships and Related Transactions, and Director Independence",
        ),
        ("item14", "Principal Accountant Fees and Services"),
        ("item15", "Exhibit and Financial Statement Schedules"),
        ("item16", "Form 10-K Summary"),
    ]

    def _process(self) -> Optional[pa.Table]:
        """Build the section table for the filing's 10-K document.

        Returns:
            The result of ``table_array`` applied to a mapping of
            ``"<unique_id>-<item title>"`` to cleaned section text, or ``None``
            when no section was extracted.
        """
        raw_sections: Dict[str, str] = {}
        doc_sections = self._document_processor.extract_sections("10-K")

        if "10-K" in doc_sections:
            soup = BeautifulSoup(doc_sections["10-K"], "html.parser")

            # Table-of-contents entries are <td> cells wrapping an <a href>.
            for td_tag in soup.find_all("td"):
                a_tag = td_tag.find("a", href=True)
                if not a_tag:
                    continue

                section_name = self._match_section_name(a_tag.get_text(strip=True))
                if section_name is None:
                    continue

                # The href is an in-document anchor; its target carries that id.
                content_tag = soup.find(id=a_tag["href"].lstrip("#"))
                section_key = f"{self._document_processor.unique_id}-{section_name}"
                raw_sections[section_key] = self._collect_section_text(content_tag)

            # Trim each section where the following section's title shows up.
            raw_sections = self._clean_overlapping_sections(raw_sections)

        cleaned_sections = {
            section_key: clean(raw_content)
            for section_key, raw_content in raw_sections.items()
        }
        return table_array(cleaned_sections) if cleaned_sections else None

    def _match_section_name(self, text: str) -> Optional[str]:
        """Return the first known item title found in ``text`` (case-insensitive), else ``None``."""
        for _, section_name in self._SECTION_NAMES:
            # Titles are literal text, so escape them before using them as a pattern.
            if re.search(re.escape(section_name), text, re.IGNORECASE):
                return section_name
        return None

    def _collect_section_text(self, content_tag) -> str:
        """Return the text of ``content_tag`` plus its following siblings.

        Collection stops at the first sibling that ``_is_new_section`` reports
        as a section start. A missing ``content_tag`` yields ``""``.
        """
        if not content_tag:
            return ""

        section_content = ""
        if content_tag.name == "span" and not content_tag.get_text(strip=True):
            # An empty anchor <span>: the heading text sits in the wrapping
            # <p>/<span>, if there is one.
            parent_tag = content_tag.find_parent(["p", "span"])
            if parent_tag:
                section_content = parent_tag.get_text(separator=" ", strip=True)
        else:
            section_content = content_tag.get_text(separator=" ", strip=True)

        parts = [section_content]
        next_tag = content_tag.find_next_sibling()
        while next_tag and not self._is_new_section(next_tag):
            parts.append(next_tag.get_text(separator=" ", strip=True))
            next_tag = next_tag.find_next_sibling()
        return " ".join(parts)

    def _is_new_section(self, element) -> bool:
        """
        Checks if the current element contains the start of a new section.

        An element counts as a section start when it is a ``<p>`` or ``<span>``
        with an ``id`` attribute and its text contains one of the item titles.
        """
        if element.name in ("p", "span") and element.get("id"):
            return self._match_section_name(element.get_text()) is not None
        return False

    def _section_name_from_key(self, section_key: str) -> Optional[str]:
        """Return the item title that ``section_key`` ends with, or ``None``.

        Keys have the form ``"<unique_id>-<item title>"``; the longest matching
        title wins so a title that is a suffix of another cannot shadow it.
        """
        candidates = [
            section_name
            for _, section_name in self._SECTION_NAMES
            if section_key.endswith(f"-{section_name}")
        ]
        return max(candidates, key=len, default=None)

    def _clean_overlapping_sections(self, sections: Dict[str, str]) -> Dict[str, str]:
        """
        Ensure each section's content is unique and doesn't include the content of subsequent sections.

        Each section is cut at the first case-insensitive occurrence of the
        title of the section extracted right after it (dictionary order).
        ``sections`` is updated in place and also returned.
        """
        section_keys = list(sections)
        for current_key, next_key in zip(section_keys, section_keys[1:]):
            # Take the title from the next extracted key. A position in
            # _SECTION_NAMES only lines up when the filing contains every
            # item with none missing.
            next_section_name = self._section_name_from_key(next_key)
            if next_section_name is None:
                continue

            current_content = sections[current_key]
            next_section_match = re.search(
                re.escape(next_section_name), current_content, re.IGNORECASE
            )
            if next_section_match:
                sections[current_key] = current_content[
                    : next_section_match.start()
                ].strip()

        return sections