datadock 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadock-1.0.2/LICENSE +21 -0
- datadock-1.0.2/PKG-INFO +179 -0
- datadock-1.0.2/README.md +139 -0
- datadock-1.0.2/datadock/__init__.py +19 -0
- datadock-1.0.2/datadock/config.py +4 -0
- datadock-1.0.2/datadock/controllers/__init__.py +4 -0
- datadock-1.0.2/datadock/controllers/_base_controllers.py +120 -0
- datadock-1.0.2/datadock/controllers/_controller_10k.py +157 -0
- datadock-1.0.2/datadock/controllers/_controller_8k.py +204 -0
- datadock-1.0.2/datadock/controllers/_factory.py +39 -0
- datadock-1.0.2/datadock/controllers/control.py +76 -0
- datadock-1.0.2/datadock/core.py +88 -0
- datadock-1.0.2/datadock/ddhtml/__init__.py +5 -0
- datadock-1.0.2/datadock/ddhtml/doc_headers.py +128 -0
- datadock-1.0.2/datadock/ddhtml/extract_data.py +222 -0
- datadock-1.0.2/datadock/ddhtml/extract_tables.py +41 -0
- datadock-1.0.2/datadock/ddhtml/scrape_r_html.py +208 -0
- datadock-1.0.2/datadock/filings.py +22 -0
- datadock-1.0.2/datadock/financials/__init__.py +5 -0
- datadock-1.0.2/datadock/financials/base_table.py +176 -0
- datadock-1.0.2/datadock/financials/doc_links.py +219 -0
- datadock-1.0.2/datadock/metadata/__init__.py +0 -0
- datadock-1.0.2/datadock/metadata/__version__.py +4 -0
- datadock-1.0.2/datadock/src/__init__.py +14 -0
- datadock-1.0.2/datadock/src/_rich_.py +220 -0
- datadock-1.0.2/datadock/src/_tabulate_.py +104 -0
- datadock-1.0.2/datadock/src/_xml.py +138 -0
- datadock-1.0.2/datadock/src/api_base.py +214 -0
- datadock-1.0.2/datadock/src/api_errors.py +148 -0
- datadock-1.0.2/datadock/src/api_response.py +16 -0
- datadock-1.0.2/datadock/src/blob_storage.py +102 -0
- datadock-1.0.2/datadock/src/constants.py +55 -0
- datadock-1.0.2/datadock/src/custom_logger.py +85 -0
- datadock-1.0.2/datadock/src/dataclasses.py +22 -0
- datadock-1.0.2/datadock/src/document.py +55 -0
- datadock-1.0.2/datadock/src/filings.py +287 -0
- datadock-1.0.2/datadock/src/filters.py +126 -0
- datadock-1.0.2/datadock/src/models.py +322 -0
- datadock-1.0.2/datadock/src/utils.py +179 -0
- datadock-1.0.2/datadock/utils/__init__.py +0 -0
- datadock-1.0.2/datadock/utils/_compact.py +11 -0
- datadock-1.0.2/pyproject.toml +88 -0
datadock-1.0.2/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 PETER MBACHU
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
datadock-1.0.2/PKG-INFO
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datadock
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: DataDock Python library to access SEC EDGAR filings.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: datadockpy,datadockAI,python,SEC
|
|
7
|
+
Author: Peter Mbachu
|
|
8
|
+
Author-email: peter.mbachu@datadock.ai.com
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Natural Language :: English
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Dist: azure-core (>=1.31.0,<2.0.0)
|
|
22
|
+
Requires-Dist: azure-storage-blob (>=12.23.1,<13.0.0)
|
|
23
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
|
24
|
+
Requires-Dist: colorama (>=0.4.6,<0.5.0)
|
|
25
|
+
Requires-Dist: fastcore (>=1.7.17,<2.0.0)
|
|
26
|
+
Requires-Dist: humanize (>=4.11.0,<5.0.0)
|
|
27
|
+
Requires-Dist: lxml (>=5.3.0,<6.0.0)
|
|
28
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
29
|
+
Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
|
|
30
|
+
Requires-Dist: python-decouple (>=3.8,<4.0)
|
|
31
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
|
32
|
+
Requires-Dist: rich (>=13.8.1,<14.0.0)
|
|
33
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
34
|
+
Project-URL: Bug Tracker, https://github.com/DataDock-AI/DataDockPy/issues
|
|
35
|
+
Project-URL: Documentation, https://paystackease.readthedocs.io/en/latest/
|
|
36
|
+
Project-URL: Homepage, https://github.com/DataDock-AI/DataDockPy
|
|
37
|
+
Project-URL: Source Code, https://github.com/DataDock-AI/DataDockPy
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# DataDockPy Library
|
|
41
|
+
|
|
42
|
+
--------------
|
|
43
|
+
|
|
44
|
+

|
|
45
|
+

|
|
46
|
+

|
|
47
|
+
|
|
48
|
+
<p align="center">
|
|
49
|
+
<a href="https://github.com/DataDock-AI/DataDockPy">
|
|
50
|
+
<img src="media/20241001135438-ezgif.com-video-to-gif-converter.gif" alt="DataDock demo" height="350">
|
|
51
|
+
</a>
|
|
52
|
+
</p>
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
## Introduction
|
|
56
|
+
|
|
57
|
+
DataDockPy is a powerful and user-friendly Python package designed to enhance data analysis.
|
|
58
|
+
It simplifies the extraction and presentation of information from the SEC Edgar database,
|
|
59
|
+
offering enriched data display for SEC form types, form controls, and financial analysis,
|
|
60
|
+
making it an essential tool for professionals working with regulatory filings.
|
|
61
|
+
|
|
62
|
+
This library uses `Poetry` as a dependency manager.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
- 📁 **Access any SEC filing**: You can access SEC filings of form types 8-K and 10-K.
|
|
69
|
+
- 📅 **List filings for any date range**: List filings by date or by date range, e.g. `2024-02-29:2024-03-15`
|
|
70
|
+
- 🌟 **User Friendly library**: Uses **[rich](https://rich.readthedocs.io/en/stable/introduction.html) & [tabulate](https://github.com/astanin/python-tabulate)** library to display SEC Edgar data in a beautiful way.
|
|
71
|
+
- 🔄 **Page through filings**: Use `filings.next()` and `filings.previous()` to page through filings
|
|
72
|
+
- 🏗️ **Filter filings data**: Build data filtering by cik, accession number, form type, filing date
|
|
73
|
+
- ✅ **Select a filing**: You can select a filing from the list of filings.
|
|
74
|
+
- 🔍 **Preview the text data for a filing**: You can preview the filing (sections) in the terminal or a notebook.
|
|
75
|
+
- 📊 **Parse to Dataframe**: You can parse filings to a dataframe.
|
|
76
|
+
- 📈 **Financial Statement**: Get financial statements of Form 8-K and Form 10-K of various companies.
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
## Get Started on Windows/MacOS/Linux Terminal
|
|
80
|
+
|
|
81
|
+
1. Open your terminal and install poetry using `pip`.
|
|
82
|
+
|
|
83
|
+
```commandline
|
|
84
|
+
pip install poetry
|
|
85
|
+
```
|
|
86
|
+
or
|
|
87
|
+
|
|
88
|
+
Install poetry using `pipx`.
|
|
89
|
+
|
|
90
|
+
```commandline
|
|
91
|
+
pipx install poetry
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
2. Create a project and clone DataDockPy github repository.
|
|
95
|
+
|
|
96
|
+
```git
|
|
97
|
+
git clone https://github.com/DataDock-AI/DataDockPy.git
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
3. Change directory to `DataDockPy`
|
|
101
|
+
|
|
102
|
+
```commandline
|
|
103
|
+
cd DataDockPy
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
4. Run the poetry command to install dependencies:
|
|
107
|
+
|
|
108
|
+
```commandline
|
|
109
|
+
poetry install
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
5. Activate virtual environment using poetry:
|
|
113
|
+
|
|
114
|
+
```commandline
|
|
115
|
+
poetry shell
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
6. Set up your SEC_IDENTITY in an `.env` file
|
|
119
|
+
|
|
120
|
+
```dotenv
|
|
121
|
+
SEC_IDENTITY=<your email or username for SEC IDENTITY>
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
7. See the following scripts on how to use the package: `run_checks.py` and `run_checks2.py`
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
## Use DataDockPy with Jupyter
|
|
128
|
+
|
|
129
|
+
1. Open terminal and install poetry using `pip`.
|
|
130
|
+
|
|
131
|
+
```commandline
|
|
132
|
+
pip install poetry
|
|
133
|
+
```
|
|
134
|
+
or
|
|
135
|
+
|
|
136
|
+
Install poetry using `pipx`.
|
|
137
|
+
|
|
138
|
+
```commandline
|
|
139
|
+
pipx install poetry
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
2. Create a project and clone `DataDockPy` github repository
|
|
143
|
+
|
|
144
|
+
```git
|
|
145
|
+
git clone https://github.com/DataDock-AI/DataDockPy.git
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
3. Change directory to `DataDockPy`
|
|
149
|
+
|
|
150
|
+
```commandline
|
|
151
|
+
cd DataDockPy
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
4. Open your project directory on Anaconda or Visual Studio Code.
|
|
155
|
+
|
|
156
|
+
5. Choose the Python environment in which Poetry was installed (in Visual Studio Code, open the Command Palette with `Ctrl+Shift+P` to select it).
|
|
157
|
+
6. If asked to install `ipykernel`, see here on installation: [ipykernel installation](https://devinschumacher.com/how-to-setup-jupyter-notebook-virtual-environment-vs-code-kernels/)
|
|
158
|
+
7. Check to see if poetry is installed:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
!poetry --version
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
8. Run the poetry command to install dependencies:
|
|
165
|
+
|
|
166
|
+
```commandline
|
|
167
|
+
poetry install
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
## Download DataDockPy Source Code
|
|
172
|
+
|
|
173
|
+
You can download the source code as a `zip` or `tar.gz` archive here: [DataDockPy Source Code](https://github.com/DataDock-AI/DataDockPy/releases/tag/v0.1.0).
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
If you run into a problem or would like to contribute, please open an issue here: https://github.com/DataDock-AI/DataDockPy/issues
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
datadock-1.0.2/README.md
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# DataDockPy Library
|
|
2
|
+
|
|
3
|
+
--------------
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://github.com/DataDock-AI/DataDockPy">
|
|
11
|
+
<img src="media/20241001135438-ezgif.com-video-to-gif-converter.gif" alt="DataDock demo" height="350">
|
|
12
|
+
</a>
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## Introduction
|
|
17
|
+
|
|
18
|
+
DataDockPy is a powerful and user-friendly Python package designed to enhance data analysis.
|
|
19
|
+
It simplifies the extraction and presentation of information from the SEC Edgar database,
|
|
20
|
+
offering enriched data display for SEC form types, form controls, and financial analysis,
|
|
21
|
+
making it an essential tool for professionals working with regulatory filings.
|
|
22
|
+
|
|
23
|
+
This library uses `Poetry` as a dependency manager.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
- 📁 **Access any SEC filing**: You can access SEC filings of form types 8-K and 10-K.
|
|
30
|
+
- 📅 **List filings for any date range**: List filings by date or by date range, e.g. `2024-02-29:2024-03-15`
|
|
31
|
+
- 🌟 **User Friendly library**: Uses **[rich](https://rich.readthedocs.io/en/stable/introduction.html) & [tabulate](https://github.com/astanin/python-tabulate)** library to display SEC Edgar data in a beautiful way.
|
|
32
|
+
- 🔄 **Page through filings**: Use `filings.next()` and `filings.previous()` to page through filings
|
|
33
|
+
- 🏗️ **Filter filings data**: Build data filtering by cik, accession number, form type, filing date
|
|
34
|
+
- ✅ **Select a filing**: You can select a filing from the list of filings.
|
|
35
|
+
- 🔍 **Preview the text data for a filing**: You can preview the filing (sections) in the terminal or a notebook.
|
|
36
|
+
- 📊 **Parse to Dataframe**: You can parse filings to a dataframe.
|
|
37
|
+
- 📈 **Financial Statement**: Get financial statements of Form 8-K and Form 10-K of various companies.
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
## Get Started on Windows/MacOS/Linux Terminal
|
|
41
|
+
|
|
42
|
+
1. Open your terminal and install poetry using `pip`.
|
|
43
|
+
|
|
44
|
+
```commandline
|
|
45
|
+
pip install poetry
|
|
46
|
+
```
|
|
47
|
+
or
|
|
48
|
+
|
|
49
|
+
Install poetry using `pipx`.
|
|
50
|
+
|
|
51
|
+
```commandline
|
|
52
|
+
pipx install poetry
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
2. Create a project and clone DataDockPy github repository.
|
|
56
|
+
|
|
57
|
+
```git
|
|
58
|
+
git clone https://github.com/DataDock-AI/DataDockPy.git
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
3. Change directory to `DataDockPy`
|
|
62
|
+
|
|
63
|
+
```commandline
|
|
64
|
+
cd DataDockPy
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
4. Run the poetry command to install dependencies:
|
|
68
|
+
|
|
69
|
+
```commandline
|
|
70
|
+
poetry install
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
5. Activate virtual environment using poetry:
|
|
74
|
+
|
|
75
|
+
```commandline
|
|
76
|
+
poetry shell
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
6. Set up your SEC_IDENTITY in an `.env` file
|
|
80
|
+
|
|
81
|
+
```dotenv
|
|
82
|
+
SEC_IDENTITY=<your email or username for SEC IDENTITY>
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
7. See the following scripts on how to use the package: `run_checks.py` and `run_checks2.py`
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
## Use DataDockPy with Jupyter
|
|
89
|
+
|
|
90
|
+
1. Open terminal and install poetry using `pip`.
|
|
91
|
+
|
|
92
|
+
```commandline
|
|
93
|
+
pip install poetry
|
|
94
|
+
```
|
|
95
|
+
or
|
|
96
|
+
|
|
97
|
+
Install poetry using `pipx`.
|
|
98
|
+
|
|
99
|
+
```commandline
|
|
100
|
+
pipx install poetry
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
2. Create a project and clone `DataDockPy` github repository
|
|
104
|
+
|
|
105
|
+
```git
|
|
106
|
+
git clone https://github.com/DataDock-AI/DataDockPy.git
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
3. Change directory to `DataDockPy`
|
|
110
|
+
|
|
111
|
+
```commandline
|
|
112
|
+
cd DataDockPy
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
4. Open your project directory on Anaconda or Visual Studio Code.
|
|
116
|
+
|
|
117
|
+
5. Choose the Python environment in which Poetry was installed (in Visual Studio Code, open the Command Palette with `Ctrl+Shift+P` to select it).
|
|
118
|
+
6. If asked to install `ipykernel`, see here on installation: [ipykernel installation](https://devinschumacher.com/how-to-setup-jupyter-notebook-virtual-environment-vs-code-kernels/)
|
|
119
|
+
7. Check to see if poetry is installed:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
!poetry --version
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
8. Run the poetry command to install dependencies:
|
|
126
|
+
|
|
127
|
+
```commandline
|
|
128
|
+
poetry install
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
## Download DataDockPy Source Code
|
|
133
|
+
|
|
134
|
+
You can download the source code as a `zip` or `tar.gz` archive here: [DataDockPy Source Code](https://github.com/DataDock-AI/DataDockPy/releases/tag/v0.1.0).
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
If you run into a problem or would like to contribute, please open an issue here: https://github.com/DataDock-AI/DataDockPy/issues
|
|
138
|
+
|
|
139
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from datadock.src import (
|
|
2
|
+
DataDockError,
|
|
3
|
+
CustomLogger,
|
|
4
|
+
SECRequestHandler,
|
|
5
|
+
# CurrentFile,
|
|
6
|
+
display_table_with_rich,
|
|
7
|
+
# TextSummaryModel,
|
|
8
|
+
# SentimentAnalysisModel,
|
|
9
|
+
# EntityRecognitionModel,
|
|
10
|
+
)
|
|
11
|
+
from datadock.core import DataPager, table_array
|
|
12
|
+
from datadock.filings import CurrentFilings
|
|
13
|
+
from datadock.controllers import FormControl
|
|
14
|
+
from datadock.financials import parse_html_new, DocumentFinancial, DataDocFiling
|
|
15
|
+
from datadock.ddhtml import CompanyFilingInfo
|
|
16
|
+
from datadock.metadata.__version__ import __version__
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
VERSION = __version__
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Optional, List, Union
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from rich.console import Group
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.text import Text
|
|
8
|
+
|
|
9
|
+
from datadock.src import DocumentProcessor
|
|
10
|
+
|
|
11
|
+
from datadock.src.custom_logger import CustomLogger
|
|
12
|
+
from datadock.core import DataPager
|
|
13
|
+
from datadock.src._rich_ import repr_rich
|
|
14
|
+
from datadock.src.constants import IntString
|
|
15
|
+
from datadock.src.filters import filter_by_section_titles
|
|
16
|
+
from datadock.src.dataclasses import FilingsState
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FormBaseController(ABC):
    """Abstract base for controllers that turn one filing into titled sections.

    Subclasses implement :meth:`_process`, which is invoked once from
    ``__init__`` and returns the extracted sections as a ``pyarrow.Table``,
    or ``None`` when nothing could be extracted.

    Attributes:
        cik: The ``cik`` value passed to the constructor.
        accession_number: The ``accession_number`` passed to the constructor.
        form_type: The ``form_type`` passed to the constructor.
        data_table: Whatever ``_process()`` returned (table or ``None``).
        data: ``FormDataSections`` wrapping ``data_table``, or ``None`` when
            ``_process()`` produced no table.
    """

    def __init__(
        self,
        cik: str,
        accession_number: str,
        form_type: str,
        logger: CustomLogger,
        document_processor: DocumentProcessor,
    ):
        self.cik = cik
        self.accession_number = accession_number
        self.form_type = form_type
        self._logger = logger
        self._document_processor = document_processor
        self.data_table: Optional[pa.Table] = self._process()
        # _process() is allowed to return None. FormDataSections calls len()
        # on its table, so wrapping None would raise TypeError here; keep
        # ``data`` as None instead so the guards below can do their job.
        self.data: Optional[FormDataSections] = (
            FormDataSections(self.data_table)
            if self.data_table is not None
            else None
        )

    @abstractmethod
    def _process(self) -> Optional[pa.Table]:
        """Extract the filing's sections; return ``None`` if there are none."""
        pass

    def to_pandas(self, *columns) -> Optional[pd.DataFrame]:
        """Return the extracted sections as a pandas ``DataFrame``.

        Args:
            *columns: Optional column labels; when given, only those columns
                are kept (via ``DataFrame.filter``).

        Returns:
            The data frame, or ``None`` when no sections were extracted.
        """
        if self.data is None:
            return None
        data_frame = self.data.data.to_pandas()
        return data_frame.filter(list(columns)) if columns else data_frame

    # Next step is to implement using rich or tabulate to display the
    # pyarrow table result to users.

    def filter(
        self,
        titles: Optional[Union[str, List[IntString]]] = None,
    ) -> Optional["FormDataSections"]:
        """Return a new ``FormDataSections`` restricted to the given titles.

        Args:
            titles: A single title string or a list of titles. List entries
                are converted with ``str()`` before filtering. A falsy value
                (``None``, ``""``, ``[]``) applies no filter.

        Returns:
            A new ``FormDataSections`` over the (possibly filtered) table, or
            ``None`` when no sections were extracted.
        """
        if self.data is None:
            return None

        filing_index = self.data.data
        section_titles = titles

        if isinstance(section_titles, list):
            section_titles = [str(title) for title in section_titles]

        # Filter by section title only when something was actually requested.
        if section_titles:
            filing_index = filter_by_section_titles(filing_index, titles=section_titles)
        return FormDataSections(filing_index)

    @property
    def get_all_sections(self):
        """All extracted sections (the same object as ``self.data``)."""
        return self.data
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class FormDataSections:
    """Holds a table of filing sections and renders it as nested rich panels."""

    def __init__(
        self,
        filing_index: pa.Table,
        original_state: Optional[FilingsState] = None,
        logger: Optional[CustomLogger] = None,
    ) -> None:
        """Store the table, its paging state, a pager and a logger.

        Args:
            filing_index: Table of sections; ``__rich__`` reads its
                "Section Title" and "Section Content" columns.
            original_state: Paging state to reuse; when falsy a fresh
                ``FilingsState(0, len(filing_index))`` is created.
            logger: Logger to use; when falsy ``CustomLogger().logger`` is used.
        """
        self.data = filing_index
        if original_state:
            self.original_state = original_state
        else:
            self.original_state = FilingsState(0, len(filing_index))
        self.data_pager = DataPager(self.data)
        if logger:
            self._logger: CustomLogger = logger
        else:
            self._logger: CustomLogger = CustomLogger().logger

    def _page_index(self) -> range:
        """Build the range index for the current page of data."""
        if not self.original_state:
            return range(*self.data_pager.current_range)
        # The page never extends past the number of rows actually held.
        rows_on_page = min(self.data_pager.page_size, len(self.data))
        return range(
            self.original_state.page_start,
            self.original_state.page_start + rows_on_page,
        )

    @staticmethod
    def _section_panel(title, body) -> Panel:
        """Wrap one section's text in a panel headed by its bold blue title."""
        heading = Text(title, style="bold blue")
        content = Text(body)
        return Panel(content, title=heading)

    def __rich__(self) -> Panel:
        """Render every section as a sub-panel inside one green outer panel."""
        # pandas makes it easy to walk the two columns in lockstep.
        frame = self.data.to_pandas()
        sub_panels = [
            self._section_panel(title, body)
            for title, body in zip(frame["Section Title"], frame["Section Content"])
        ]
        return Panel(
            Group(*sub_panels),
            title="DataDock Filings Sections and Contents",
            border_style="green",
        )

    def __repr__(self):
        """Return the rich rendering converted to text by ``repr_rich``."""
        return repr_rich(self.__rich__())
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import pyarrow as pa
|
|
3
|
+
from typing import Optional, Dict
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
from datadock.controllers._base_controllers import FormBaseController
|
|
8
|
+
from datadock.src.utils import clean
|
|
9
|
+
from datadock.core import table_array
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Clean10KController(FormBaseController):
    """Extracts the standard Form 10-K items from a filing's HTML document.

    Section links are discovered in ``<td>`` cells that contain an ``<a href>``
    whose text mentions one of the known section titles; the link target's id
    is then used to locate the section's content.
    """

    # (item id, section title) pairs in canonical 10-K order. Only the title
    # is used for matching in this class.
    _SECTION_NAMES = [
        ("item1", "Business"),
        ("item1a", "Risk Factors"),
        ("item1b", "Unresolved Staff Comments"),
        ("item1c", "Cybersecurity"),
        ("item2", "Properties"),
        ("item3", "Legal Proceedings"),
        ("item4", "Mine Safety Disclosures"),
        (
            "item5",
            "Market for Registrant’s Common Equity, Related Stockholder Matters, and Issuer Purchases of Equity Securities",
        ),
        ("item6", "Selected Financial Data"),
        (
            "item7",
            "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
        ),
        ("item7a", "Quantitative and Qualitative Disclosures about Market Risk"),
        ("item8", "Financial Statements and Supplementary Data"),
        (
            "item9",
            "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
        ),
        ("item9a", "Controls and Procedures"),
        ("item9b", "Other Information"),
        (
            "item9c",
            "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
        ),
        ("item10", "Directors, Executive Officers, and Corporate Governance"),
        ("item11", "Executive Compensation"),
        (
            "item12",
            "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
        ),
        (
            "item13",
            "Certain Relationships and Related Transactions, and Director Independence",
        ),
        ("item14", "Principal Accountant Fees and Services"),
        ("item15", "Exhibit and Financial Statement Schedules"),
        ("item16", "Form 10-K Summary"),
    ]

    def _process(self) -> Optional[pa.Table]:
        """Extract, de-overlap and clean all recognised 10-K sections.

        Returns:
            ``table_array(sections)`` where ``sections`` maps
            ``"<unique_id>-<section title>"`` to cleaned text, or ``None``
            when no section was found.
        """
        raw_sections: Dict[str, str] = {}
        doc_sections = self._document_processor.extract_sections("10-K")

        if "10-K" in doc_sections:
            soup = BeautifulSoup(doc_sections["10-K"], "html.parser")

            # Section links live in <td> cells that wrap an <a href=...>.
            for td_tag in soup.find_all("td"):
                a_tag = td_tag.find("a", href=True)
                if not a_tag:
                    continue

                section_name = self._match_section_name(a_tag.get_text(strip=True))
                if section_name is None:
                    continue

                # The href fragment is the id of the element that starts the section.
                content_tag = soup.find(id=a_tag["href"].lstrip("#"))
                section_key = f"{self._document_processor.unique_id}-{section_name}"
                raw_sections[section_key] = self._collect_section_content(content_tag)

        # Remove overlaps where a section's text runs into the next section.
        raw_sections = self._clean_overlapping_sections(raw_sections)

        cleaned_sections = {
            section_key: clean(raw_content)
            for section_key, raw_content in raw_sections.items()
        }
        return table_array(cleaned_sections) if cleaned_sections else None

    def _match_section_name(self, text: str) -> Optional[str]:
        """Return the first known section title found in *text*, else ``None``.

        Matching is case-insensitive; titles are escaped so they are always
        treated as literal text rather than as regular expressions.
        """
        for _, section_name in self._SECTION_NAMES:
            if re.search(re.escape(section_name), text, re.IGNORECASE):
                return section_name
        return None

    def _collect_section_content(self, content_tag) -> str:
        """Gather a section's text starting at *content_tag*.

        Takes the tag's own text (or, for an empty ``<span>``, the text of its
        nearest ``<p>``/``<span>`` ancestor) and appends the text of following
        siblings until one of them starts a new section. Returns ``""`` when
        *content_tag* is ``None``.
        """
        if content_tag is None:
            return ""

        section_content = ""
        if content_tag.name == "span" and not content_tag.get_text(strip=True):
            # An empty anchor span: the visible heading text sits in a wrapper.
            parent_tag = content_tag.find_parent(["p", "span"])
            if parent_tag:
                section_content = parent_tag.get_text(separator=" ", strip=True)
        else:
            section_content = content_tag.get_text(separator=" ", strip=True)

        # NOTE(review): siblings are taken relative to content_tag even when
        # the text above came from its parent — confirm this is intended.
        parts = [section_content]
        next_tag = content_tag.find_next_sibling()
        while next_tag and not self._is_new_section(next_tag):
            parts.append(next_tag.get_text(separator=" ", strip=True))
            next_tag = next_tag.find_next_sibling()
        return " ".join(parts)

    def _is_new_section(self, element) -> bool:
        """
        Checks if the current element contains the start of a new section.

        An element counts as a section start when it is a ``<p>`` or ``<span>``
        with an ``id`` attribute and its text mentions a known section title.
        """
        if element.name in ["p", "span"] and element.get("id"):
            return self._match_section_name(element.get_text()) is not None
        return False

    def _section_name_from_key(self, section_key: str) -> Optional[str]:
        """Recover the section title from a ``"<unique_id>-<title>"`` key."""
        for _, section_name in self._SECTION_NAMES:
            if section_key.endswith(f"-{section_name}"):
                return section_name
        return None

    def _clean_overlapping_sections(self, sections: Dict[str, str]) -> Dict[str, str]:
        """
        Ensure each section's content is unique and doesn't include the content of subsequent sections.

        Each section is cut at the first (case-insensitive) occurrence of the
        title of the section that was extracted immediately after it. The
        following title is derived from the next key rather than from the
        position in ``_SECTION_NAMES``, because a filing may omit sections and
        the two orderings then no longer line up.

        Args:
            sections: Mapping of ``"<unique_id>-<title>"`` to raw text, in
                extraction order. Modified in place.

        Returns:
            The same mapping with overlapping tails removed.
        """
        section_keys = list(sections.keys())
        for current_section_key, next_section_key in zip(
            section_keys, section_keys[1:]
        ):
            next_section_name = self._section_name_from_key(next_section_key)
            if next_section_name is None:
                # Unknown key format: nothing reliable to trim against.
                continue

            current_content = sections[current_section_key]
            next_section_match = re.search(
                re.escape(next_section_name), current_content, re.IGNORECASE
            )
            if next_section_match:
                # Cut the content from the start of the next section's name.
                sections[current_section_key] = current_content[
                    : next_section_match.start()
                ].strip()

        return sections
|