datamule 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +5 -3
- datamule/book/s3transfer.py +1 -1
- datamule/document/document.py +1 -1
- datamule/{portfolio.py → portfolio/portfolio.py} +14 -23
- datamule/{portfolio_compression_utils.py → portfolio/portfolio_compression_utils_legacy.py} +2 -0
- datamule/seclibrary/bq.py +2 -0
- datamule/sheet/__init__.py +0 -0
- datamule/{sheet.py → sheet/sheet.py} +4 -4
- datamule/submission/__init__.py +0 -0
- datamule/{submission.py → submission/submission.py} +150 -97
- datamule/submission/tar_submission.py +79 -0
- datamule/tables/__init__.py +0 -0
- {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/METADATA +1 -1
- {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/RECORD +28 -24
- /datamule/{document/tables → cloud}/__init__.py +0 -0
- /datamule/{sentiment → portfolio}/__init__.py +0 -0
- /datamule/{document/tables → tables}/tables.py +0 -0
- /datamule/{document/tables → tables}/tables_13fhr.py +0 -0
- /datamule/{document/tables → tables}/tables_25nse.py +0 -0
- /datamule/{document/tables → tables}/tables_informationtable.py +0 -0
- /datamule/{document/tables → tables}/tables_npx.py +0 -0
- /datamule/{document/tables → tables}/tables_ownership.py +0 -0
- /datamule/{document/tables → tables}/tables_proxyvotingrecord.py +0 -0
- /datamule/{document/tables → tables}/tables_sbsef.py +0 -0
- /datamule/{document/tables → tables}/tables_sdr.py +0 -0
- /datamule/{document/tables → tables}/utils.py +0 -0
- {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/WHEEL +0 -0
- {datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/top_level.txt +0 -0
datamule/__init__.py
CHANGED
@@ -1,9 +1,9 @@
-from .submission import Submission
-from .portfolio import Portfolio
+from .submission.submission import Submission
+from .portfolio.portfolio import Portfolio
 from .document.document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
-from .sheet import Sheet
+from .sheet.sheet import Sheet
 from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
@@ -32,6 +32,8 @@ def _setup_notebook_env():
 # Set up notebook environment
 _setup_notebook_env()
 
+
+# TODO, is this load bearing?
 __all__ = [
     '_load_package_csv',
     'load_package_dataset',
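
The top-level re-exports keep their public names, so `from datamule import ...` is unaffected by the submodule moves; only deep imports need the new paths. A minimal sketch of the before/after (assuming the wheel is installed as `datamule`):

# Top-level imports work unchanged in 2.3.7:
from datamule import Submission, Portfolio, Sheet, Document

# Deep imports must follow the new package layout:
from datamule.submission.submission import Submission   # was: from datamule.submission import Submission
from datamule.portfolio.portfolio import Portfolio      # was: from datamule.portfolio import Portfolio
from datamule.sheet.sheet import Sheet                  # was: from datamule.sheet import Sheet
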
datamule/book/s3transfer.py
CHANGED
datamule/document/document.py
CHANGED
@@ -10,7 +10,7 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-from .tables.tables import Tables
+from ..tables.tables import Tables
 
 from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
 from ..utils.pdf import has_extractable_text
datamule/{portfolio.py → portfolio/portfolio.py}
RENAMED
@@ -1,19 +1,19 @@
 from pathlib import Path
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from .submission import Submission
-from .sec.submissions.downloader import download as sec_download
-from .sec.submissions.textsearch import filter_text
-from .config import Config
+from ..submission.submission import Submission
+from ..sec.submissions.downloader import download as sec_download
+from ..sec.submissions.textsearch import filter_text
+from ..config import Config
 import os
 import tarfile
 from threading import Lock
-from .helper import _process_cik_and_metadata_filters
-from .datamule.downloader import download as seclibrary_download
-from .sec.xbrl.filter_xbrl import filter_xbrl
-from .sec.submissions.monitor import Monitor
-from .portfolio_compression_utils import CompressionManager
-from .datamule.sec_connector import SecConnector
+from ..helper import _process_cik_and_metadata_filters
+from ..datamule.downloader import download as seclibrary_download
+from ..sec.xbrl.filter_xbrl import filter_xbrl
+from ..sec.submissions.monitor import Monitor
+from .portfolio_compression_utils_legacy import CompressionManager
+from ..datamule.sec_connector import SecConnector
 import shutil
 
 
@@ -31,6 +31,7 @@ class Portfolio:
 
         self.monitor = Monitor()
 
+
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
@@ -47,6 +48,7 @@ class Portfolio:
         regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
         batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
 
+
         # Load regular submissions (existing logic)
         def load_submission(folder):
             return Submission(folder)
@@ -99,11 +101,12 @@ class Portfolio:
             try:
                 submission = Submission(
                     batch_tar_path=batch_tar_path,
-                    accession_prefix=accession_prefix,
+                    accession=accession_prefix,
                     portfolio_ref=self
                 )
                 submissions.append(submission)
             except Exception as e:
+                print(f"Path: {batch_tar_path}. Exception: {e}")
                 pass
                 #print(f"Path: {batch_tar_path}. Exception: {e}")
             pbar.update(1) # Update progress for each successful submission
@@ -111,18 +114,6 @@ class Portfolio:
         return submissions
 
 
-    def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
-        """
-        Compress all individual submissions into batch tar files.
-
-        Args:
-            compression: None, 'gzip', or 'zstd' for document compression (default: None)
-            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
-            threshold: Size threshold for compressing individual documents (default: 1MB)
-            max_batch_size: Maximum size per batch tar file (default: 1GB)
-        """
-        CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
-
     def decompress(self):
         """
         Decompress all batch tar files back to individual submission directories.
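
Portfolio.compress() is gone in this release (its CompressionManager backend survives only as portfolio/portfolio_compression_utils_legacy.py), while decompress() is kept for unpacking existing batch tars. A hedged sketch of the surviving surface; the 'filings' directory is illustrative, not from the diff:

from datamule import Portfolio

portfolio = Portfolio('filings')   # hypothetical directory holding submission dirs / batch tars
portfolio.decompress()             # still supported: expands batch tar files back into directories
# portfolio.compress(...)          # removed in 2.3.7; no public replacement appears in this diff
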
datamule/seclibrary/bq.py
CHANGED
File without changes

datamule/sheet/__init__.py
ADDED
File without changes

datamule/{sheet.py → sheet/sheet.py}
RENAMED
@@ -1,10 +1,10 @@
 from pathlib import Path
 import csv
 import os
-from .helper import _process_cik_and_metadata_filters, load_package_dataset
-from .sec.xbrl.downloadcompanyfacts import download_company_facts
-from .datamule.datamule_lookup import datamule_lookup
-from .datamule.datamule_mysql_rds import query_mysql_rds
+from ..helper import _process_cik_and_metadata_filters, load_package_dataset
+from ..sec.xbrl.downloadcompanyfacts import download_company_facts
+from ..datamule.datamule_lookup import datamule_lookup
+from ..datamule.datamule_mysql_rds import query_mysql_rds
 from company_fundamentals.utils import get_fundamental_mappings
 from company_fundamentals import construct_fundamentals
 class Sheet:

datamule/submission/__init__.py
ADDED
File without changes

datamule/{submission.py → submission/submission.py}
RENAMED
@@ -1,42 +1,103 @@
 from pathlib import Path
 import json
-from .document.document import Document
+from ..document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.parse_sgml import transform_metadata_string
 from secsgml.utils import bytes_to_str
-from .sec.utils import headers
+from ..sec.utils import headers
 import tarfile
-import zstandard as zstd
-import gzip
 import urllib.request
 from secxbrl import parse_inline_xbrl
 from company_fundamentals import construct_fundamentals
 from decimal import Decimal
-from .utils.format_accession import format_accession
-
+from ..utils.format_accession import format_accession
+from .tar_submission import tar_submission
+
+# probably needs rework later
+class FundamentalsAccessor:
+    def __init__(self, submission):
+        self.submission = submission
+        self._cache = {}
+        self._all_data = None
+
+    def __getattr__(self, name):
+        # Try as category first
+        try:
+            if name not in self._cache:
+                result = self.submission.parse_fundamentals(categories=[name])
+                if result:  # Only cache if we got actual data
+                    self._cache[name] = result
+                return result
+        except:
+            pass
+
+        # Fall back to dict behavior
+        return getattr(self._get_all_data(), name)
+
+    def _get_all_data(self):
+        if self._all_data is None:
+            self._all_data = self.submission.parse_fundamentals(categories=None)
+        return self._all_data
+
+    # Make the accessor behave like the underlying data
+    def __getitem__(self, key):
+        return self._get_all_data()[key]
+
+    def __repr__(self):
+        return repr(self._get_all_data())
+
+    def __str__(self):
+        return str(self._get_all_data())
+
+    def __iter__(self):
+        return iter(self._get_all_data())
+
+    def __len__(self):
+        return len(self._get_all_data()) if self._get_all_data() else 0
+
+    def __bool__(self):
+        return bool(self._get_all_data())
 
 class Submission:
     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
-                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):
+                 batch_tar_path=None, accession=None, portfolio_ref=None,url=None):
 
+        # get accession number
+        # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
+        if path is not None:
+            self.accession = format_accession(path.stem,'no-dash')
+        elif batch_tar_path is not None:
+            self.accession = format_accession(accession,'no-dash')
+        elif url is not None or sgml_content is not None:
+            if accession is None:
+                raise ValueError("If using url or sgml_content, accession must be specified.")
+            self.accession = format_accession(accession,'no-dash')
+        else:
+            raise ValueError("If this appears, please post an issue: https://github.com/john-friedman/datamule-python/issues.")
+
 
         # declare vars to be filled later
         self._xbrl = None
         self._fundamentals_cache = {}
+        self._tar = None
+        self._tar_compression_type = 'zstd'
+        self._tar_compression_level = 3
+        self._accession_year_2d = None
+        self._documents = None
 
         # Validate parameters
         param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
         if param_count != 1:
             raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
-        if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
-            raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
+        if batch_tar_path is not None and (self.accession is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession and portfolio_ref")
 
         # Initialize batch tar attributes
         self.batch_tar_path = batch_tar_path
-        self.accession_prefix = accession_prefix
         self.portfolio_ref = portfolio_ref
 
+        # here should set accession either from url or make it a required argument if sgml content
        if url is not None or sgml_content is not None:
             if url is not None:
                 request = urllib.request.Request(url, headers=headers)
@@ -49,17 +110,15 @@ class Submission:
 
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-            metadata = bytes_to_str(metadata)
+            metadata = bytes_to_str(metadata,lower=False)
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
 
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-            # code dupe
-            self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
-            self.documents = []
+            self.documents_obj_list = []
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
@@ -72,7 +131,7 @@ class Submission:
                 # write as txt if not declared
                 filename = doc.get('filename','.txt')
                 extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
+                self.documents_obj_list.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
                 filtered_metadata_documents.append(doc)
 
@@ -85,24 +144,22 @@ class Submission:
             # Load metadata from batch tar
             with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
-                metadata_obj = tar_handle.extractfile(f'{self.accession_prefix}/metadata.json')
+                metadata_obj = tar_handle.extractfile(f'{self.accession}/metadata.json')
                 metadata = json.loads(metadata_obj.read().decode('utf-8'))
 
             # Set metadata path using :: notation
-            metadata_path = f"{batch_tar_path}::{self.accession_prefix}/metadata.json"
+            metadata_path = f"{batch_tar_path}::{self.accession}/metadata.json"
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-            # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
-            self.accession = format_accession(self.accession_prefix,'dash')
 
-            #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
         elif path is not None:
             self.path = Path(path)
+
+
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
                     metadata_obj = tar.extractfile('metadata.json')
@@ -118,65 +175,45 @@ class Submission:
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-            self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
 
         # booleans
-        self.xbrl_bool = any(
+        self._xbrl_bool = any(
             doc['type'] in ('EX-100.INS', 'EX-101.INS') or
             doc.get('filename', '').endswith('_htm.xml')
             for doc in self.metadata.content['documents']
         )
 
-        self._has_fundamentals = self.xbrl_bool
+        self._has_fundamentals = self._xbrl_bool
 
+
+    # TODO rework for better metadata accessing
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
         if self.path is None and self.batch_tar_path is None:
-            return self.documents[idx]
+            return self.documents_obj_list[idx]
 
         # Get filename from metadata - this is the source of truth
         filename = doc.get('filename')
         if filename is None:
             filename = doc['sequence'] + '.txt'
 
-
-        # If filename ends with .gz or .zst, the real extension is before that
-        if filename.endswith('.gz'):
-            extension = Path(filename[:-3]).suffix
-            is_compressed = 'gzip'
-        elif filename.endswith('.zst'):
-            extension = Path(filename[:-4]).suffix
-            is_compressed = 'zstd'
-        else:
-            extension = Path(filename).suffix
-            is_compressed = False
-
+        extension = Path(filename).suffix
         # Handle batch tar case
         if self.batch_tar_path is not None:
             with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
 
                 # Use exact filename from metadata
-                tar_path = f'{self.accession_prefix}/{filename}'
+                tar_path = f'{self.accession}/{filename}'
                 content = tar_handle.extractfile(tar_path).read()
 
 
-
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
-
-            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+            document_path = f"{self.batch_tar_path}::{self.accession}/{filename}"
 
         # Handle regular path case
         else:
@@ -188,27 +225,7 @@ class Submission:
                         content = tar.extractfile(filename).read()
                         actual_filename = filename
                     except:
-                        try:
-                            content = tar.extractfile(filename + '.gz').read()
-                            actual_filename = filename + '.gz'
-                            is_compressed = 'gzip'
-                        except:
-                            try:
-                                content = tar.extractfile(filename + '.zst').read()
-                                actual_filename = filename + '.zst'
-                                is_compressed = 'zstd'
-                            except:
-                                raise FileNotFoundError(f"Document file not found in tar: {filename}")
-
-                # Decompress if compressed
-                if is_compressed == 'gzip':
-                    content = gzip.decompress(content)
-                elif is_compressed == 'zstd':
-                    content = zstd.ZstdDecompressor().decompress(content)
-
-                # Decode text files
-                # if extension in ['.htm', '.html', '.txt', '.xml']:
-                #     content = content.decode('utf-8', errors='replace')
+                        raise FileNotFoundError(f"Document file not found in tar: {filename}")
 
                 document_path = f"{self.path}::{actual_filename}"
 
@@ -222,15 +239,6 @@ class Submission:
             with document_path.open('rb') as f:
                 content = f.read()
 
-            # Decompress if needed based on filename extension
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
 
         return Document(
             type=doc['type'],
@@ -260,20 +268,24 @@ class Submission:
     def parse_xbrl(self):
        if self._xbrl:
             return
-
+
+        if not self._xbrl_bool:
+            print(f"Submission: {self.accession} has no xbrl")
+            return
+
         for idx, doc in enumerate(self.metadata.content['documents']):
             if doc['type'] in ['EX-100.INS','EX-101.INS']:
                 document = self._load_document_by_index(idx)
                 self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
-                return
-
+                return
+
             if doc['filename'].endswith('_htm.xml'):
                 document = self._load_document_by_index(idx)
                 self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
                 return
 
     @property
-    def xbrl(self):
+    def xbrl(self):
         if self._xbrl is None:
             self.parse_xbrl()
         return self._xbrl
@@ -353,20 +365,61 @@ class Submission:
 
     @property
     def fundamentals(self):
-        """…
-
+        """Access fundamentals via attributes: sub.fundamentals.incomeStatement"""
+        if not hasattr(self, '_fundamentals_accessor'):
+            self._fundamentals_accessor = FundamentalsAccessor(self)
+        return self._fundamentals_accessor
+
+    @property
+    def tar(self):
+        return self._tar_submission().getvalue()
+
+    def set_tar_compression(self,compression_type='zstd',level=3):
+        self._tar_compression_type = compression_type
+        self._tar_compression_level = level
+
+    def _tar_submission(self):
+        if self._tar is not None:
+            return self._tar
+        else:
+            documents_obj_list = self._get_documents_obj_list()
+            self._tar = tar_submission(
+                documents_obj_list=documents_obj_list,
+                metadata=self.metadata.content,
+                compression_type=self._tar_compression_type,
+                level=self._tar_compression_level
+            )
+            return self._tar
+
+    @property
+    def accession_year_2d(self):
+        return self._get_accession_year_2d()
+
+    def _get_accession_year_2d(self):
+        if self._accession_year_2d is not None:
+            return self._accession_year_2d
+        self._accession_year_2d = format_accession(self.accession,'dash').split('-')[1]
+        return self._accession_year_2d
+
+    @property
+    def documents(self):
+        return self._get_documents()
+
+    def _get_documents(self):
+        if self._documents is not None:
+            return self._documents
+
+        self._documents = self.metadata.content['documents']
+        return self._documents
 
-    def …
-
-        if …
-
-        return self.parse_fundamentals(categories=[category])
+    def _get_documents_obj_list(self):
+        """Get all documents as Document objects"""
+        if hasattr(self, 'documents_obj_list'):
+            return self.documents_obj_list
 
-        # …
-
-
-
-        return result
+        # Generate documents_obj_list for batch tar and path cases
+        documents_obj_list = []
+        for idx in range(len(self.metadata.content['documents'])):
+            documents_obj_list.append(self._load_document_by_index(idx))
 
-
-        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+        return documents_obj_list
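
Taken together, the new constructor contract and the cached properties behave roughly as sketched below; the URL and accession values are placeholders, and incomeStatement is the category named in the fundamentals docstring:

from datamule import Submission

# accession is now mandatory when loading from a URL or raw SGML content
sub = Submission(url='https://www.sec.gov/Archives/...',   # placeholder URL
                 accession='0000000000-95-000000')         # placeholder accession number

docs = sub.documents                        # metadata document entries, cached after first access
year = sub.accession_year_2d                # two-digit year parsed out of the dashed accession
income = sub.fundamentals.incomeStatement   # attribute-style lookup via FundamentalsAccessor

sub.set_tar_compression(compression_type='zstd', level=3)  # the defaults, shown explicitly
tar_bytes = sub.tar                         # in-memory tar of the submission, built once and cached
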
datamule/submission/tar_submission.py
ADDED
@@ -0,0 +1,79 @@
+import zstandard as zstd
+from secsgml.utils import calculate_documents_locations_in_tar
+import tarfile
+import io
+import json
+
+# Note: we don't actually need accession at this level. TODO
+
+def compress_content(content, compression_type, level):
+    if compression_type == 'zstd':
+        # Create compressor with specified level
+        compressor = zstd.ZstdCompressor(level=level)
+
+        # Handle string content
+        # This should never be called
+        if isinstance(content, str):
+            content_bytes = content.encode('utf-8')
+        else:
+            content_bytes = content
+
+        # Compress and return
+        return compressor.compress(content_bytes)
+
+    # Return uncompressed if not zstd
+    return content
+
+def compress_content_list(document_tuple_list, compression_type, level):
+    if compression_type is None:
+        return document_tuple_list
+
+    if level is None:
+        level = 3
+
+    # Create new list to avoid modifying original
+    compressed_list = []
+    for document_tuple in document_tuple_list:
+        content = document_tuple[0]
+        accession = document_tuple[1]
+        compressed_content = compress_content(content, compression_type, level)
+        compressed_list.append((compressed_content, accession))
+
+    return compressed_list
+
+def tar_content_list(metadata, document_tuple_list_compressed):
+    # Update metadata with compressed sizes
+    for i, (content, accession) in enumerate(document_tuple_list_compressed):
+        metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+    metadata = calculate_documents_locations_in_tar(metadata)
+
+    tar_buffer = io.BytesIO()
+    with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
+        # Add metadata first
+        metadata_json = json.dumps(metadata).encode('utf-8')
+        tarinfo = tarfile.TarInfo(f'metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        # Add each content
+        for i, (content, accession) in enumerate(document_tuple_list_compressed):
+            doc = metadata['documents'][i]
+            filename = doc.get('filename', doc['sequence'] + '.txt')
+
+            tarinfo = tarfile.TarInfo(name=filename)
+            tarinfo.size = len(content)
+            tar.addfile(tarinfo, io.BytesIO(content))
+
+    # Return the tar buffer
+    tar_buffer.seek(0)  # Reset buffer position to beginning
+    return tar_buffer
+
+def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
+    """Takes a list of documents, compresses them, then tars them."""
+    document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
+    document_tuple_list_compressed = compress_content_list(document_tuple_list,  # Fixed: correct parameter name
+                                                           compression_type=compression_type,
+                                                           level=level)
+
+    return tar_content_list(metadata, document_tuple_list_compressed)
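
tar_submission() is what the Submission.tar property calls internally, and it can also be driven directly. A hedged sketch reusing the sub object from the previous example, with compression disabled so the tar members stay raw bytes:

import tarfile
from datamule.submission.tar_submission import tar_submission

buf = tar_submission(metadata=sub.metadata.content,
                     documents_obj_list=sub._get_documents_obj_list(),
                     compression_type=None)           # None skips the zstd pass entirely

with tarfile.open(fileobj=buf, mode='r') as tar:      # buf is an io.BytesIO, already seeked to 0
    print(tar.getnames())                             # ['metadata.json', <document filenames>, ...]
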
datamule/tables/__init__.py
ADDED
File without changes

{datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/RECORD
CHANGED
@@ -1,16 +1,13 @@
-datamule/__init__.py,sha256=…
+datamule/__init__.py,sha256=fy8h9IQQqSqOvRXJ6Q7Q-8nWwdnw2THP6puqfGkIB4k,1278
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/datasets.py,sha256=1A9PPPyLIQ51evXLSsiKmVxNmjbO6c2FGszrairREjc,2058
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
-datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
-datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
-datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
 datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/book/book.py,sha256=Vw33JHhmulNDWRN2AQpUQrf8wgVqqUYg5QJgbKhBNak,773
-datamule/book/s3transfer.py,sha256=…
+datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
+datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
@@ -18,22 +15,14 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=…
-datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
-datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
-datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
-datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
-datamule/document/tables/tables_npx.py,sha256=tZDBAonAQWLsgecVK_OwIgNcUJhuV5L2gkTSNbXAgNE,6652
-datamule/document/tables/tables_ownership.py,sha256=pRoFFRGLWp8gKAAvvUbVRxIU2xDFAQhwi9bgwddsT8A,11185
-datamule/document/tables/tables_proxyvotingrecord.py,sha256=S_Th294-KWRL-QVXkexNWIksSaFePZGSVq6EU8iiK0o,896
-datamule/document/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-53s0qDokkPI0,635
-datamule/document/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
-datamule/document/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
+datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/html_mapping_dicts.py,sha256=pba3utMr2KldPeEGnMRkHyVw7D2WHSDpg_5u36pHMII,5411
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
+datamule/portfolio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/portfolio/portfolio.py,sha256=YPIvS4KKuEtm8A1XvNqDF39f4LJHhAFWmtpJzjbGDhY,11680
+datamule/portfolio/portfolio_compression_utils_legacy.py,sha256=1nlbz7JfBDrI0pwTyFiBF856xqGXvQRYBulLUpk7G1A,12695
 datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/utils.py,sha256=96bavyG2Kq1t8L1YA2vwYnAHKIKdRSoVXxBO5QH1HWo,2196
 datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,8 +39,23 @@ datamule/sec/xbrl/filter_xbrl.py,sha256=QiSfm7tsJVLIw2PFqGh8D01qsRe_ZB-mbFhr6KcB
 datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/seclibrary/bq.py,sha256=…
-datamule/sentiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,18048
+datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
+datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/submission/submission.py,sha256=4UtdViw-h_4Rqt09SFe8-WWdLqaD55T3vqTUVRB0CsE,17058
+datamule/submission/tar_submission.py,sha256=lkm1neVLW2_-G26VylL6Rzx98Cavvml0Qd2wlJHD0bw,3075
+datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
+datamule/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
+datamule/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
+datamule/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
+datamule/tables/tables_npx.py,sha256=tZDBAonAQWLsgecVK_OwIgNcUJhuV5L2gkTSNbXAgNE,6652
+datamule/tables/tables_ownership.py,sha256=pRoFFRGLWp8gKAAvvUbVRxIU2xDFAQhwi9bgwddsT8A,11185
+datamule/tables/tables_proxyvotingrecord.py,sha256=S_Th294-KWRL-QVXkexNWIksSaFePZGSVq6EU8iiK0o,896
+datamule/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-53s0qDokkPI0,635
+datamule/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
+datamule/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
 datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/tags/config.py,sha256=rxawvOBDT2v72Aw-VkmnUOLsKSAIrZBrjz_E0hPU7MY,1677
 datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
@@ -61,7 +65,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
 datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
-datamule-2.3.5.dist-info/METADATA,sha256=…
-datamule-2.3.5.dist-info/WHEEL,sha256=…
-datamule-2.3.5.dist-info/top_level.txt,sha256=…
-datamule-2.3.5.dist-info/RECORD,,
+datamule-2.3.7.dist-info/METADATA,sha256=1Igs40zdVpr6XPH4s2ToG5EIyAsI1lpdA1yiuGUjsx4,609
+datamule-2.3.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.3.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.3.7.dist-info/RECORD,,
datamule/{document/tables → cloud}/__init__.py
RENAMED
File without changes

datamule/{sentiment → portfolio}/__init__.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_13fhr.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_25nse.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_informationtable.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_npx.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_ownership.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_proxyvotingrecord.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_sbsef.py
RENAMED
File without changes

datamule/{document/tables → tables}/tables_sdr.py
RENAMED
File without changes

datamule/{document/tables → tables}/utils.py
RENAMED
File without changes

{datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/WHEEL
RENAMED
File without changes

{datamule-2.3.5.dist-info → datamule-2.3.7.dist-info}/top_level.txt
RENAMED
File without changes