datamule 2.3.5__py3-none-any.whl → 2.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datamule might be problematic.
- datamule/__init__.py +5 -3
- datamule/book/s3transfer.py +1 -1
- datamule/document/document.py +1 -1
- datamule/{portfolio.py → portfolio/portfolio.py} +14 -23
- datamule/{portfolio_compression_utils.py → portfolio/portfolio_compression_utils_legacy.py} +2 -0
- datamule/seclibrary/bq.py +2 -0
- datamule/sheet/__init__.py +0 -0
- datamule/{sheet.py → sheet/sheet.py} +4 -4
- datamule/submission/__init__.py +0 -0
- datamule/{submission.py → submission/submission.py} +142 -96
- datamule/submission/tar_submission.py +79 -0
- datamule/tables/__init__.py +0 -0
- {datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/METADATA +1 -1
- {datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/RECORD +28 -24
- /datamule/{document/tables → cloud}/__init__.py +0 -0
- /datamule/{sentiment → portfolio}/__init__.py +0 -0
- /datamule/{document/tables → tables}/tables.py +0 -0
- /datamule/{document/tables → tables}/tables_13fhr.py +0 -0
- /datamule/{document/tables → tables}/tables_25nse.py +0 -0
- /datamule/{document/tables → tables}/tables_informationtable.py +0 -0
- /datamule/{document/tables → tables}/tables_npx.py +0 -0
- /datamule/{document/tables → tables}/tables_ownership.py +0 -0
- /datamule/{document/tables → tables}/tables_proxyvotingrecord.py +0 -0
- /datamule/{document/tables → tables}/tables_sbsef.py +0 -0
- /datamule/{document/tables → tables}/tables_sdr.py +0 -0
- /datamule/{document/tables → tables}/utils.py +0 -0
- {datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/WHEEL +0 -0
- {datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/top_level.txt +0 -0
datamule/__init__.py
CHANGED
@@ -1,9 +1,9 @@
-from .submission import Submission
-from .portfolio import Portfolio
+from .submission.submission import Submission
+from .portfolio.portfolio import Portfolio
 from .document.document import Document
 from .helper import _load_package_csv, load_package_dataset
 from .config import Config
-from .sheet import Sheet
+from .sheet.sheet import Sheet
 from .index import Index
 from .package_updater import PackageUpdater
 from .utils.format_accession import format_accession
@@ -32,6 +32,8 @@ def _setup_notebook_env():
 # Set up notebook environment
 _setup_notebook_env()
 
+
+# TODO, is this load bearing?
 __all__ = [
     '_load_package_csv',
     'load_package_dataset',
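Because the package root re-exports these classes, imports from datamule itself are unaffected by the submodule reshuffle; only code that reached into the old module paths needs updating. A minimal sketch:

# Works on both 2.3.5 and 2.3.6: the root package re-exports the public classes.
from datamule import Portfolio, Submission, Sheet, Document

# Deep imports track the new layout:
# from datamule.submission import Submission              # 2.3.5 (module)
# from datamule.submission.submission import Submission   # 2.3.6 (package)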
datamule/book/s3transfer.py
CHANGED
datamule/document/document.py
CHANGED
@@ -10,7 +10,7 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-from
+from ..tables.tables import Tables
 
 from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
 from ..utils.pdf import has_extractable_text
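The Tables helper now lives in the relocated top-level tables subpackage (datamule/document/tables → datamule/tables). A sketch of the path change for code that imported it directly:

# from datamule.document.tables.tables import Tables   # 2.3.5 location
from datamule.tables.tables import Tables              # 2.3.6 location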
datamule/{portfolio.py → portfolio/portfolio.py}
CHANGED
@@ -1,19 +1,19 @@
 from pathlib import Path
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from .submission import Submission
-from
-from
-from
+from ..submission.submission import Submission
+from ..sec.submissions.downloader import download as sec_download
+from ..sec.submissions.textsearch import filter_text
+from ..config import Config
 import os
 import tarfile
 from threading import Lock
-from
-from
-from
-from
-from .
-from
+from ..helper import _process_cik_and_metadata_filters
+from ..datamule.downloader import download as seclibrary_download
+from ..sec.xbrl.filter_xbrl import filter_xbrl
+from ..sec.submissions.monitor import Monitor
+from .portfolio_compression_utils_legacy import CompressionManager
+from ..datamule.sec_connector import SecConnector
 import shutil
 
 
@@ -31,6 +31,7 @@ class Portfolio:
 
         self.monitor = Monitor()
 
+
         if self.path.exists():
             self._load_submissions()
             self.submissions_loaded = True
@@ -47,6 +48,7 @@
         regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
         batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
 
+
         # Load regular submissions (existing logic)
         def load_submission(folder):
             return Submission(folder)
@@ -99,11 +101,12 @@
                 try:
                     submission = Submission(
                         batch_tar_path=batch_tar_path,
-
+                        accession=accession_prefix,
                         portfolio_ref=self
                     )
                     submissions.append(submission)
                 except Exception as e:
+                    print(f"Path: {batch_tar_path}. Exception: {e}")
                     pass
                     #print(f"Path: {batch_tar_path}. Exception: {e}")
                 pbar.update(1) # Update progress for each successful submission
@@ -111,18 +114,6 @@
         return submissions
 
 
-    def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
-        """
-        Compress all individual submissions into batch tar files.
-
-        Args:
-            compression: None, 'gzip', or 'zstd' for document compression (default: None)
-            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
-            threshold: Size threshold for compressing individual documents (default: 1MB)
-            max_batch_size: Maximum size per batch tar file (default: 1GB)
-        """
-        CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
-
     def decompress(self):
         """
         Decompress all batch tar files back to individual submission directories.
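Alongside the rename, batch-tar submissions are now built with the accession keyword (accession_prefix is gone), load failures are printed instead of silently swallowed, and compress() is removed while decompress() survives (its CompressionManager backing is kept as a legacy module). A sketch of the surviving surface, assuming a local filings directory:

from datamule import Portfolio

portfolio = Portfolio('filings')   # assumed path argument; loads submission dirs, .tar files, and batch tars
# portfolio.compress(...)          # removed in 2.3.6
portfolio.decompress()             # still available for unpacking existing batch tars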
datamule/seclibrary/bq.py
CHANGED
datamule/sheet/__init__.py
File without changes
datamule/{sheet.py → sheet/sheet.py}
CHANGED
@@ -1,10 +1,10 @@
 from pathlib import Path
 import csv
 import os
-from
-from
-from
-from
+from ..helper import _process_cik_and_metadata_filters, load_package_dataset
+from ..sec.xbrl.downloadcompanyfacts import download_company_facts
+from ..datamule.datamule_lookup import datamule_lookup
+from ..datamule.datamule_mysql_rds import query_mysql_rds
 from company_fundamentals.utils import get_fundamental_mappings
 from company_fundamentals import construct_fundamentals
 class Sheet:
datamule/submission/__init__.py
File without changes
datamule/{submission.py → submission/submission.py}
CHANGED
@@ -1,10 +1,10 @@
 from pathlib import Path
 import json
-from
+from ..document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.parse_sgml import transform_metadata_string
 from secsgml.utils import bytes_to_str
-from
+from ..sec.utils import headers
 import tarfile
 import zstandard as zstd
 import gzip
@@ -12,31 +12,85 @@ import urllib.request
 from secxbrl import parse_inline_xbrl
 from company_fundamentals import construct_fundamentals
 from decimal import Decimal
-from
-
+from ..utils.format_accession import format_accession
+from .tar_submission import tar_submission
+
+# probably needs rework later
+class FundamentalsAccessor:
+    def __init__(self, submission):
+        self.submission = submission
+        self._cache = {}
+        self._all_data = None
+
+    def __getattr__(self, category):
+        if category not in self._cache:
+            self._cache[category] = self.submission.parse_fundamentals(categories=[category])
+        return self._cache[category]
+
+    def _get_all_data(self):
+        if self._all_data is None:
+            self._all_data = self.submission.parse_fundamentals(categories=None)
+        return self._all_data
+
+    # Make the accessor behave like the underlying data
+    def __getitem__(self, key):
+        return self._get_all_data()[key]
+
+    def __repr__(self):
+        return repr(self._get_all_data())
+
+    def __str__(self):
+        return str(self._get_all_data())
+
+    def __iter__(self):
+        return iter(self._get_all_data())
+
+    def __len__(self):
+        return len(self._get_all_data()) if self._get_all_data() else 0
+
+    def __bool__(self):
+        return bool(self._get_all_data())
 
 class Submission:
     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
-                 batch_tar_path=None,
+                 batch_tar_path=None, accession=None, portfolio_ref=None,url=None):
 
+        # get accession number
+        # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
+        if path is not None:
+            self.accession = format_accession(path.stem,'no-dash')
+        elif batch_tar_path is not None:
+            self.accession = format_accession(accession,'no-dash')
+        elif url is not None or sgml_content is not None:
+            if accession is None:
+                raise ValueError("If using url or sgml_content, accession must be specified.")
+            self.accession = format_accession(accession,'no-dash')
+        else:
+            raise ValueError("If this appears, please post an issue: https://github.com/john-friedman/datamule-python/issues.")
+
 
         # declare vars to be filled later
         self._xbrl = None
        self._fundamentals_cache = {}
+        self._tar = None
+        self._tar_compression_type = 'zstd'
+        self._tar_compression_level = 3
+        self._accession_year_2d = None
+        self._documents = None
 
         # Validate parameters
         param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
         if param_count != 1:
             raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
-        if batch_tar_path is not None and (
-        raise ValueError("batch_tar_path requires both
+        if batch_tar_path is not None and (self.accession is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession and portfolio_ref")
 
         # Initialize batch tar attributes
         self.batch_tar_path = batch_tar_path
-        self.accession_prefix = accession_prefix
         self.portfolio_ref = portfolio_ref
 
+        # here should set accession either from url or make it a required argument if sgml content
         if url is not None or sgml_content is not None:
             if url is not None:
                 request = urllib.request.Request(url, headers=headers)
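The new FundamentalsAccessor parses one fundamentals category lazily per attribute access and caches it, while dict-style access, iteration, and len() materialize all categories once via parse_fundamentals(categories=None). A usage sketch (the path and the incomeStatement category name are illustrative; the latter is taken from the fundamentals docstring later in this diff):

from pathlib import Path
from datamule import Submission

sub = Submission(path=Path('filings/0001234567-95-000042.tar'))
income = sub.fundamentals.incomeStatement   # parses and caches only this category
categories = list(sub.fundamentals)         # one full parse_fundamentals(categories=None) call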
@@ -49,17 +103,15 @@ class Submission:
 
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-            metadata = bytes_to_str(metadata)
+            metadata = bytes_to_str(metadata,lower=False)
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
 
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-            # code dupe
-            self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
-            self.
+            self.documents_obj_list = []
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
@@ -72,7 +124,7 @@ class Submission:
                 # write as txt if not declared
                 filename = doc.get('filename','.txt')
                 extension = Path(filename).suffix
-                self.
+                self.documents_obj_list.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
                 filtered_metadata_documents.append(doc)
 
@@ -85,24 +137,22 @@ class Submission:
             # Load metadata from batch tar
             with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
-                metadata_obj = tar_handle.extractfile(f'{
+                metadata_obj = tar_handle.extractfile(f'{self.accession}/metadata.json')
                 metadata = json.loads(metadata_obj.read().decode('utf-8'))
 
             # Set metadata path using :: notation
-            metadata_path = f"{batch_tar_path}::{
+            metadata_path = f"{batch_tar_path}::{self.accession}/metadata.json"
 
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-            # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
-            self.accession = format_accession(self.accession_prefix,'dash')
 
-            #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
         elif path is not None:
             self.path = Path(path)
+
+
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
                     metadata_obj = tar.extractfile('metadata.json')
@@ -118,65 +168,45 @@ class Submission:
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-            self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
 
         # booleans
-        self.
+        self._xbrl_bool = any(
             doc['type'] in ('EX-100.INS', 'EX-101.INS') or
             doc.get('filename', '').endswith('_htm.xml')
             for doc in self.metadata.content['documents']
         )
 
-        self._has_fundamentals = self.
+        self._has_fundamentals = self._xbrl_bool
 
+
+    # TODO rework for better metadata accessing
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
         if self.path is None and self.batch_tar_path is None:
-            return self.
+            return self.documents_obj_list[idx]
 
         # Get filename from metadata - this is the source of truth
         filename = doc.get('filename')
         if filename is None:
             filename = doc['sequence'] + '.txt'
 
-
-        # If filename ends with .gz or .zst, the real extension is before that
-        if filename.endswith('.gz'):
-            extension = Path(filename[:-3]).suffix
-            is_compressed = 'gzip'
-        elif filename.endswith('.zst'):
-            extension = Path(filename[:-4]).suffix
-            is_compressed = 'zstd'
-        else:
-            extension = Path(filename).suffix
-            is_compressed = False
-
+        extension = Path(filename).suffix
         # Handle batch tar case
         if self.batch_tar_path is not None:
             with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
 
                 # Use exact filename from metadata
-                tar_path = f'{self.
+                tar_path = f'{self.accession}/{filename}'
                 content = tar_handle.extractfile(tar_path).read()
 
 
-
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
-
-            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+            document_path = f"{self.batch_tar_path}::{self.accession}/{filename}"
 
         # Handle regular path case
         else:
@@ -188,27 +218,7 @@ class Submission:
                     content = tar.extractfile(filename).read()
                     actual_filename = filename
                 except:
-
-                    content = tar.extractfile(filename + '.gz').read()
-                    actual_filename = filename + '.gz'
-                    is_compressed = 'gzip'
-                    except:
-                        try:
-                            content = tar.extractfile(filename + '.zst').read()
-                            actual_filename = filename + '.zst'
-                            is_compressed = 'zstd'
-                        except:
-                            raise FileNotFoundError(f"Document file not found in tar: {filename}")
-
-            # Decompress if compressed
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
+                    raise FileNotFoundError(f"Document file not found in tar: {filename}")
 
             document_path = f"{self.path}::{actual_filename}"
 
@@ -222,15 +232,6 @@ class Submission:
             with document_path.open('rb') as f:
                 content = f.read()
 
-            # Decompress if needed based on filename extension
-            if is_compressed == 'gzip':
-                content = gzip.decompress(content)
-            elif is_compressed == 'zstd':
-                content = zstd.ZstdDecompressor().decompress(content)
-
-            # Decode text files
-            # if extension in ['.htm', '.html', '.txt', '.xml']:
-            #     content = content.decode('utf-8', errors='replace')
 
         return Document(
             type=doc['type'],
@@ -260,20 +261,24 @@ class Submission:
     def parse_xbrl(self):
         if self._xbrl:
             return
-
+
+        if not self._xbrl_bool:
+            print(f"Submission: {self.accession} has no xbrl")
+            return
+
         for idx, doc in enumerate(self.metadata.content['documents']):
             if doc['type'] in ['EX-100.INS','EX-101.INS']:
                 document = self._load_document_by_index(idx)
                 self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
-                return
-
+                return
+
             if doc['filename'].endswith('_htm.xml'):
                 document = self._load_document_by_index(idx)
                 self._xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
                 return
 
     @property
-    def xbrl(self):
+    def xbrl(self):
         if self._xbrl is None:
             self.parse_xbrl()
         return self._xbrl
@@ -353,20 +358,61 @@ class Submission:
 
     @property
     def fundamentals(self):
-        """
-
-
-
-
-
-
-
+        """Access fundamentals via attributes: sub.fundamentals.incomeStatement"""
+        if not hasattr(self, '_fundamentals_accessor'):
+            self._fundamentals_accessor = FundamentalsAccessor(self)
+        return self._fundamentals_accessor
+
+    @property
+    def tar(self):
+        return self._tar_submission().getvalue()
+
+    def set_tar_compression(self,compression_type='zstd',level=3):
+        self._tar_compression_type = compression_type
+        self._tar_compression_level = level
+
+    def _tar_submission(self):
+        if self._tar is not None:
+            return self._tar
+        else:
+            documents_obj_list = self._get_documents_obj_list()
+            self._tar = tar_submission(
+                documents_obj_list=documents_obj_list,
+                metadata=self.metadata.content,
+                compression_type=self._tar_compression_type,
+                level=self._tar_compression_level
+            )
+            return self._tar
 
-
-
-
-
-
+    @property
+    def accession_year_2d(self):
+        return self._get_accession_year_2d()
+
+    def _get_accession_year_2d(self):
+        if self._accession_year_2d is not None:
+            return self._accession_year_2d
+        self._accession_year_2d = format_accession(self.accession,'dash').split('-')[1]
+        return self._accession_year_2d
+
+    @property
+    def documents(self):
+        return self._get_documents()
+
+    def _get_documents(self):
+        if self._documents is not None:
+            return self._documents
+
+        self._documents = self.metadata.content['documents']
+        return self._documents
+
+    def _get_documents_obj_list(self):
+        """Get all documents as Document objects"""
+        if hasattr(self, 'documents_obj_list'):
+            return self.documents_obj_list
+
+        # Generate documents_obj_list for batch tar and path cases
+        documents_obj_list = []
+        for idx in range(len(self.metadata.content['documents'])):
+            documents_obj_list.append(self._load_document_by_index(idx))
 
-
-        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+        return documents_obj_list
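In place of the removed per-document .gz/.zst handling, a Submission can now serialize itself to an in-memory tar, with compression settings held on the instance. A sketch, assuming sub is a loaded Submission:

sub.set_tar_compression(compression_type='zstd', level=3)   # these are the defaults
tar_bytes = sub.tar                                         # bytes of the whole archive

with open(f'{sub.accession}.tar', 'wb') as f:               # hypothetical output path
    f.write(tar_bytes)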
datamule/submission/tar_submission.py
ADDED
@@ -0,0 +1,79 @@
+import zstandard as zstd
+from secsgml.utils import calculate_documents_locations_in_tar
+import tarfile
+import io
+import json
+
+# Note: we don't actually need accession at this level. TODO
+
+def compress_content(content, compression_type, level):
+    if compression_type == 'zstd':
+        # Create compressor with specified level
+        compressor = zstd.ZstdCompressor(level=level)
+
+        # Handle string content
+        # This should never be called
+        if isinstance(content, str):
+            content_bytes = content.encode('utf-8')
+        else:
+            content_bytes = content
+
+        # Compress and return
+        return compressor.compress(content_bytes)
+
+    # Return uncompressed if not zstd
+    return content
+
+def compress_content_list(document_tuple_list, compression_type, level):
+    if compression_type is None:
+        return document_tuple_list
+
+    if level is None:
+        level = 3
+
+    # Create new list to avoid modifying original
+    compressed_list = []
+    for document_tuple in document_tuple_list:
+        content = document_tuple[0]
+        accession = document_tuple[1]
+        compressed_content = compress_content(content, compression_type, level)
+        compressed_list.append((compressed_content, accession))
+
+    return compressed_list
+
+def tar_content_list(metadata, document_tuple_list_compressed):
+    # Update metadata with compressed sizes
+    for i, (content, accession) in enumerate(document_tuple_list_compressed):
+        metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+    metadata = calculate_documents_locations_in_tar(metadata)
+
+    tar_buffer = io.BytesIO()
+    with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
+        # Add metadata first
+        metadata_json = json.dumps(metadata).encode('utf-8')
+        tarinfo = tarfile.TarInfo(f'metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        # Add each content
+        for i, (content, accession) in enumerate(document_tuple_list_compressed):
+            doc = metadata['documents'][i]
+            filename = doc.get('filename', doc['sequence'] + '.txt')
+
+            tarinfo = tarfile.TarInfo(name=filename)
+            tarinfo.size = len(content)
+            tar.addfile(tarinfo, io.BytesIO(content))
+
+    # Return the tar buffer
+    tar_buffer.seek(0) # Reset buffer position to beginning
+    return tar_buffer
+
+def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
+    """Takes a list of documents, compresses them, then tars them."""
+    document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
+    document_tuple_list_compressed = compress_content_list(document_tuple_list, # Fixed: correct parameter name
+                                                           compression_type=compression_type,
+                                                           level=level)
+
+    return tar_content_list(metadata, document_tuple_list_compressed)
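tar_content_list returns an io.BytesIO rewound to position 0, so the result of tar_submission can be handed straight back to tarfile. A round-trip sketch, assuming sub is a loaded Submission (this mirrors what Submission._tar_submission does internally):

import tarfile

buf = tar_submission(metadata=sub.metadata.content,
                     documents_obj_list=sub._get_documents_obj_list(),
                     compression_type='zstd', level=3)

with tarfile.open(fileobj=buf, mode='r') as tar:
    print(tar.getnames())   # ['metadata.json', <document filenames>...]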
datamule/tables/__init__.py
File without changes
{datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/RECORD
CHANGED
@@ -1,16 +1,13 @@
-datamule/__init__.py,sha256=
+datamule/__init__.py,sha256=fy8h9IQQqSqOvRXJ6Q7Q-8nWwdnw2THP6puqfGkIB4k,1278
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/datasets.py,sha256=1A9PPPyLIQ51evXLSsiKmVxNmjbO6c2FGszrairREjc,2058
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
-datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
-datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
-datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
 datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/book/book.py,sha256=Vw33JHhmulNDWRN2AQpUQrf8wgVqqUYg5QJgbKhBNak,773
-datamule/book/s3transfer.py,sha256=
+datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
+datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
@@ -18,22 +15,14 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=
-datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
-datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
-datamule/document/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
-datamule/document/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
-datamule/document/tables/tables_npx.py,sha256=tZDBAonAQWLsgecVK_OwIgNcUJhuV5L2gkTSNbXAgNE,6652
-datamule/document/tables/tables_ownership.py,sha256=pRoFFRGLWp8gKAAvvUbVRxIU2xDFAQhwi9bgwddsT8A,11185
-datamule/document/tables/tables_proxyvotingrecord.py,sha256=S_Th294-KWRL-QVXkexNWIksSaFePZGSVq6EU8iiK0o,896
-datamule/document/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-53s0qDokkPI0,635
-datamule/document/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
-datamule/document/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
+datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/html_mapping_dicts.py,sha256=pba3utMr2KldPeEGnMRkHyVw7D2WHSDpg_5u36pHMII,5411
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
+datamule/portfolio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/portfolio/portfolio.py,sha256=YPIvS4KKuEtm8A1XvNqDF39f4LJHhAFWmtpJzjbGDhY,11680
+datamule/portfolio/portfolio_compression_utils_legacy.py,sha256=1nlbz7JfBDrI0pwTyFiBF856xqGXvQRYBulLUpk7G1A,12695
 datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/utils.py,sha256=96bavyG2Kq1t8L1YA2vwYnAHKIKdRSoVXxBO5QH1HWo,2196
 datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,8 +39,23 @@ datamule/sec/xbrl/filter_xbrl.py,sha256=QiSfm7tsJVLIw2PFqGh8D01qsRe_ZB-mbFhr6KcB
 datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H5N2tbvKUzM,3307
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/seclibrary/bq.py,sha256=
-datamule/
+datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,18048
+datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
+datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/submission/submission.py,sha256=_dcQFZ0f8Me4-R5E92_HHNb_k2shcspRgQeN3s5-db4,16836
+datamule/submission/tar_submission.py,sha256=lkm1neVLW2_-G26VylL6Rzx98Cavvml0Qd2wlJHD0bw,3075
+datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
+datamule/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
+datamule/tables/tables_25nse.py,sha256=kpoOcIpra6i3Wx_6pUCj1fkx0wUbMhx7pc8yUkrBJb4,980
+datamule/tables/tables_informationtable.py,sha256=3yjuxYuLoBjRd6O0BNd0jQDmS1XUDjA6xp51Csq2cH8,649
+datamule/tables/tables_npx.py,sha256=tZDBAonAQWLsgecVK_OwIgNcUJhuV5L2gkTSNbXAgNE,6652
+datamule/tables/tables_ownership.py,sha256=pRoFFRGLWp8gKAAvvUbVRxIU2xDFAQhwi9bgwddsT8A,11185
+datamule/tables/tables_proxyvotingrecord.py,sha256=S_Th294-KWRL-QVXkexNWIksSaFePZGSVq6EU8iiK0o,896
+datamule/tables/tables_sbsef.py,sha256=X6VKVnAdWxn2TgRmaAd1WWlxPhcLPQ-53s0qDokkPI0,635
+datamule/tables/tables_sdr.py,sha256=BwHRJvtijiYvNJ2lIc_30kct6VEmLimIzX28JjZBBqo,4924
+datamule/tables/utils.py,sha256=2-X_1NsiWj_XsD9djxCXwTeIVlg-ip78gG11xACJiDs,738
 datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/tags/config.py,sha256=rxawvOBDT2v72Aw-VkmnUOLsKSAIrZBrjz_E0hPU7MY,1677
 datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
@@ -61,7 +65,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
 datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
-datamule-2.3.
-datamule-2.3.
-datamule-2.3.
-datamule-2.3.
+datamule-2.3.6.dist-info/METADATA,sha256=njsh6r7ekNM3iYGV1GIAns0_fMAlxYy6nOcxlRrHIeI,609
+datamule-2.3.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.3.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.3.6.dist-info/RECORD,,
/datamule/{document/tables → cloud}/__init__.py
File without changes
/datamule/{sentiment → portfolio}/__init__.py
File without changes
/datamule/{document/tables → tables}/tables.py
File without changes
/datamule/{document/tables → tables}/tables_13fhr.py
File without changes
/datamule/{document/tables → tables}/tables_25nse.py
File without changes
/datamule/{document/tables → tables}/tables_informationtable.py
File without changes
/datamule/{document/tables → tables}/tables_npx.py
File without changes
/datamule/{document/tables → tables}/tables_ownership.py
File without changes
/datamule/{document/tables → tables}/tables_proxyvotingrecord.py
File without changes
/datamule/{document/tables → tables}/tables_sbsef.py
File without changes
/datamule/{document/tables → tables}/tables_sdr.py
File without changes
/datamule/{document/tables → tables}/utils.py
File without changes
{datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/WHEEL
File without changes
{datamule-2.3.5.dist-info → datamule-2.3.6.dist-info}/top_level.txt
File without changes