datamule 2.0.2__py3-none-any.whl → 2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/datamule/downloader.py +1 -1
- datamule/document/document.py +2 -67
- datamule/submission.py +86 -0
- {datamule-2.0.2.dist-info → datamule-2.0.4.dist-info}/METADATA +1 -1
- {datamule-2.0.2.dist-info → datamule-2.0.4.dist-info}/RECORD +7 -7
- {datamule-2.0.2.dist-info → datamule-2.0.4.dist-info}/WHEEL +0 -0
- {datamule-2.0.2.dist-info → datamule-2.0.4.dist-info}/top_level.txt +0 -0
datamule/datamule/downloader.py
CHANGED
@@ -315,7 +315,7 @@ class Downloader:
|
|
315
315
|
filings = [filing for filing in filings if filing['accessionNumber'] in filtered_accession_numbers]
|
316
316
|
|
317
317
|
if skip_accession_numbers:
|
318
|
-
skip_accession_numbers = [
|
318
|
+
skip_accession_numbers = [format_accession(item,'int') for item in skip_accession_numbers]
|
319
319
|
filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
|
320
320
|
|
321
321
|
logger.debug(f"Generating URLs for {len(filings)} filings...")
|
datamule/document/document.py
CHANGED
@@ -12,8 +12,7 @@ from .processing import process_tabular_data
|
|
12
12
|
from pathlib import Path
|
13
13
|
import webbrowser
|
14
14
|
from secsgml.utils import bytes_to_str
|
15
|
-
|
16
|
-
from company_fundamentals import construct_fundamentals
|
15
|
+
|
17
16
|
|
18
17
|
class Document:
|
19
18
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
@@ -35,8 +34,7 @@ class Document:
|
|
35
34
|
self.extension = extension
|
36
35
|
# this will be filled by parsed
|
37
36
|
self.data = None
|
38
|
-
|
39
|
-
self.fundamentals = None
|
37
|
+
|
40
38
|
|
41
39
|
#_load_text_content
|
42
40
|
def _preprocess_txt_content(self):
|
@@ -106,69 +104,6 @@ class Document:
|
|
106
104
|
return bool(re.search(pattern, self.content))
|
107
105
|
return False
|
108
106
|
|
109
|
-
def parse_xbrl(self,type='inline'):
|
110
|
-
if self.xbrl:
|
111
|
-
return
|
112
|
-
if type =='inline':
|
113
|
-
if self.extension not in ['.htm','.html']:
|
114
|
-
return
|
115
|
-
self.xbrl = parse_inline_xbrl(self.content)
|
116
|
-
else:
|
117
|
-
raise ValueError("Only inline has been implemented so far.")
|
118
|
-
|
119
|
-
def parse_fundamentals(self,categories=None):
|
120
|
-
self.parse_xbrl()
|
121
|
-
# Transform XBRL records into the format needed by construct_fundamentals
|
122
|
-
xbrl = []
|
123
|
-
|
124
|
-
for xbrl_record in self.xbrl:
|
125
|
-
try:
|
126
|
-
# Extract basic fields
|
127
|
-
value = xbrl_record.get('_val', None)
|
128
|
-
taxonomy, name = xbrl_record['_attributes']['name'].split(':')
|
129
|
-
|
130
|
-
# Handle scaling if present
|
131
|
-
if xbrl_record.get('_attributes', {}).get('scale') is not None:
|
132
|
-
scale = int(xbrl_record['_attributes']['scale'])
|
133
|
-
try:
|
134
|
-
value = str(Decimal(value.replace(',', '')) * (Decimal(10) ** scale))
|
135
|
-
except:
|
136
|
-
pass
|
137
|
-
|
138
|
-
# Extract period dates
|
139
|
-
period_start_date = None
|
140
|
-
period_end_date = None
|
141
|
-
|
142
|
-
if xbrl_record.get('_context'):
|
143
|
-
context = xbrl_record['_context']
|
144
|
-
period_start_date = context.get('context_period_instant') or context.get('context_period_startdate')
|
145
|
-
period_end_date = context.get('context_period_enddate')
|
146
|
-
|
147
|
-
# Create record in the format expected by construct_fundamentals
|
148
|
-
record = {
|
149
|
-
'taxonomy': taxonomy,
|
150
|
-
'name': name,
|
151
|
-
'value': value,
|
152
|
-
'period_start_date': period_start_date,
|
153
|
-
'period_end_date': period_end_date
|
154
|
-
}
|
155
|
-
|
156
|
-
xbrl.append(record)
|
157
|
-
|
158
|
-
except Exception as e:
|
159
|
-
# Skip malformed records
|
160
|
-
continue
|
161
|
-
|
162
|
-
# Call construct_fundamentals with the transformed data
|
163
|
-
fundamentals = construct_fundamentals(xbrl,
|
164
|
-
taxonomy_key='taxonomy',
|
165
|
-
concept_key='name',
|
166
|
-
start_date_key='period_start_date',
|
167
|
-
end_date_key='period_end_date',
|
168
|
-
categories=categories)
|
169
|
-
|
170
|
-
self.fundamentals = fundamentals
|
171
|
-
|
172
107
|
# Note: this method will be heavily modified in the future
|
173
108
|
def parse(self):
|
174
109
|
# check if we have already parsed the content
|
datamule/submission.py
CHANGED
@@ -9,11 +9,20 @@ import tarfile
|
|
9
9
|
import zstandard as zstd
|
10
10
|
import gzip
|
11
11
|
import urllib.request
|
12
|
+
from secxbrl import parse_inline_xbrl
|
13
|
+
from company_fundamentals import construct_fundamentals
|
14
|
+
from decimal import Decimal
|
15
|
+
|
12
16
|
|
13
17
|
class Submission:
|
14
18
|
def __init__(self, path=None, sgml_content=None, keep_document_types=None,
|
15
19
|
batch_tar_path=None, accession_prefix=None, portfolio_ref=None,url=None):
|
16
20
|
|
21
|
+
|
22
|
+
# declare vars to be filled later
|
23
|
+
self.xbrl = None
|
24
|
+
self.fundamentals = None
|
25
|
+
|
17
26
|
# Validate parameters
|
18
27
|
param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path,url])
|
19
28
|
if param_count != 1:
|
@@ -238,4 +247,81 @@ class Submission:
|
|
238
247
|
if doc['type'] in document_types:
|
239
248
|
yield self._load_document_by_index(idx)
|
240
249
|
|
250
|
+
def parse_xbrl(self):
|
251
|
+
if self.xbrl:
|
252
|
+
return
|
253
|
+
|
254
|
+
for idx, doc in enumerate(self.metadata.content['documents']):
|
255
|
+
if doc['type'] in ['EX-100.INS','EX-101.INS']:
|
256
|
+
document = self._load_document_by_index(idx)
|
257
|
+
self.xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
|
258
|
+
return
|
259
|
+
|
260
|
+
if doc['filename'].endswith('_htm.xml'):
|
261
|
+
document = self._load_document_by_index(idx)
|
262
|
+
self.xbrl = parse_inline_xbrl(content=document.content,file_type='extracted_inline')
|
263
|
+
return
|
264
|
+
|
265
|
+
|
266
|
+
def parse_fundamentals(self,categories=None):
|
267
|
+
self.parse_xbrl()
|
268
|
+
|
269
|
+
# if no xbrl return
|
270
|
+
if not self.xbrl:
|
271
|
+
return
|
272
|
+
# Transform XBRL records into the format needed by construct_fundamentals
|
273
|
+
xbrl = []
|
274
|
+
|
275
|
+
for xbrl_record in self.xbrl:
|
276
|
+
try:
|
277
|
+
# Extract basic fields
|
278
|
+
value = xbrl_record.get('_val', None)
|
279
|
+
|
280
|
+
taxonomy, name = xbrl_record['_attributes']['name'].split(':')
|
281
|
+
|
282
|
+
|
283
|
+
# Handle scaling if present
|
284
|
+
if xbrl_record.get('_attributes', {}).get('scale') is not None:
|
285
|
+
scale = int(xbrl_record['_attributes']['scale'])
|
286
|
+
try:
|
287
|
+
value = str(Decimal(value.replace(',', '')) * (Decimal(10) ** scale))
|
288
|
+
except:
|
289
|
+
pass
|
290
|
+
|
241
291
|
|
292
|
+
# Extract period dates
|
293
|
+
period_start_date = None
|
294
|
+
period_end_date = None
|
295
|
+
|
296
|
+
if xbrl_record.get('_context'):
|
297
|
+
context = xbrl_record['_context']
|
298
|
+
period_start_date = context.get('context_period_instant') or context.get('context_period_startdate')
|
299
|
+
period_end_date = context.get('context_period_enddate')
|
300
|
+
|
301
|
+
# Create record in the format expected by construct_fundamentals
|
302
|
+
record = {
|
303
|
+
'taxonomy': taxonomy,
|
304
|
+
'name': name,
|
305
|
+
'value': value,
|
306
|
+
'period_start_date': period_start_date,
|
307
|
+
'period_end_date': period_end_date
|
308
|
+
}
|
309
|
+
|
310
|
+
xbrl.append(record)
|
311
|
+
|
312
|
+
except Exception as e:
|
313
|
+
# Skip malformed records
|
314
|
+
continue
|
315
|
+
|
316
|
+
|
317
|
+
# Call construct_fundamentals with the transformed data
|
318
|
+
fundamentals = construct_fundamentals(xbrl,
|
319
|
+
taxonomy_key='taxonomy',
|
320
|
+
concept_key='name',
|
321
|
+
start_date_key='period_start_date',
|
322
|
+
end_date_key='period_end_date',
|
323
|
+
categories=categories)
|
324
|
+
|
325
|
+
self.fundamentals = fundamentals
|
326
|
+
|
327
|
+
|
@@ -6,15 +6,15 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
|
|
6
6
|
datamule/portfolio.py,sha256=YViG1JgJ9SFhg8N3tOOhBI8oc6Pmi2vwnHeHmlkC_5U,12119
|
7
7
|
datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
|
8
8
|
datamule/sheet.py,sha256=Ws_YRtpvewLVioarngVMe8cgG_sp11MP9_goGbRaiWE,23952
|
9
|
-
datamule/submission.py,sha256=
|
9
|
+
datamule/submission.py,sha256=qcb5TogrB2q6x4zcGPKFf4dkrAy0bAPzY71Ops_xW44,14437
|
10
10
|
datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
|
11
11
|
datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
|
13
13
|
datamule/datamule/datamule_mysql_rds.py,sha256=Oj_xPTBKkzWsuRlb_tphjJrBW1eua1cOuxjGwJx581k,10591
|
14
|
-
datamule/datamule/downloader.py,sha256=
|
14
|
+
datamule/datamule/downloader.py,sha256=aTyVUuIwynPtHB0Z9BvCasy9Ao5wfHptNAsjN-7yDTk,18525
|
15
15
|
datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
|
16
16
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
datamule/document/document.py,sha256=
|
17
|
+
datamule/document/document.py,sha256=U9hSXT2Y06prM6sPcUU6uziV1f4_BhaaGz3QXE5zveg,14034
|
18
18
|
datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
|
19
19
|
datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
|
20
20
|
datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -65,7 +65,7 @@ datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,180
|
|
65
65
|
datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
66
66
|
datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
|
67
67
|
datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
|
68
|
-
datamule-2.0.
|
69
|
-
datamule-2.0.
|
70
|
-
datamule-2.0.
|
71
|
-
datamule-2.0.
|
68
|
+
datamule-2.0.4.dist-info/METADATA,sha256=CNSfwZgqLh3WR4TObVkP8Y1p2wWx4To_NMthG4EvhEQ,560
|
69
|
+
datamule-2.0.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
70
|
+
datamule-2.0.4.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
71
|
+
datamule-2.0.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|