datamule 1.2.0__py3-none-any.whl → 1.2.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +30 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/downloader.py +50 -9
- datamule/submission.py +111 -26
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/METADATA +1 -1
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/RECORD +14 -10
- datamule/document.py +0 -465
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/WHEEL +0 -0
- {datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/top_level.txt +0 -0
datamule/submission.py
CHANGED
```diff
@@ -1,8 +1,10 @@
 from pathlib import Path
 import json
-from .document import Document
+from .document.document import Document
 from secsgml import parse_sgml_submission_into_memory
-
+import os
+import aiofiles
+
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
```
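The import change reflects the split of the former `datamule/document.py` module into a `datamule/document/` package (`document.py`, `processing.py`, `table.py`; see the file list above). A minimal sketch of what that means for imports, assuming `datamule/__init__.py` still re-exports the class (its +3/-1 change suggests the public entry point was updated rather than dropped):

```python
# Old layout (<= 1.2.0): datamule/document.py was a single module.
# from datamule.document import Document

# New layout (1.2.3): the class lives one level deeper.
from datamule.document.document import Document

# If the top-level re-export exists, prefer it; it is insulated from
# internal layout changes like this one.
# from datamule import Document
```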
```diff
@@ -14,7 +16,13 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
+
+            # code dupe
+            self.accession = self.metadata['accession-number']
+            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
+
             self.documents = []
+            filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata['documents']):
                 type = doc.get('type')
@@ -24,16 +32,25 @@ class Submission:
                     continue
                 filename = doc.get('filename')
                 extension = Path(filename).suffix
-                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension))
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
 
+                filtered_metadata_documents.append(doc)
+
+            self.metadata['documents'] = filtered_metadata_documents
 
         if path is not None:
             self.path = Path(path)
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 self.metadata = json.load(f)
+
+            # Code dupe
+            self.accession = self.metadata['accession-number']
+            self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
 
 
+
+
     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
         if isinstance(document_type, str):
```
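Both constructor branches now set `self.accession` and `self.filing_date` (the `# code dupe` comments are the author's own note about the duplication). The metadata's `filing-date` arrives in compact `YYYYMMDD` form, and the slicing rewrites it as ISO `YYYY-MM-DD`; a quick sketch with a hypothetical value:

```python
# Hypothetical metadata value; the slicing in __init__ assumes YYYYMMDD.
filing_date_raw = '20240115'

# Same slicing as above: year, month, day.
filing_date = f"{filing_date_raw[:4]}-{filing_date_raw[4:6]}-{filing_date_raw[6:8]}"
assert filing_date == '2024-01-15'
```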
```diff
@@ -54,10 +71,13 @@ class Submission:
                 document_path = self.path / filename
                 extension = document_path.suffix
 
-                with document_path.open('
+                with document_path.open('rb') as f:
                     content = f.read()
 
-
+                if extension in ['.htm','.html','.txt','.xml']:
+                    content = content.decode('utf-8', errors='replace')
+
+                yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
             # if loaded from sgml_content
             else:
                 yield self.documents[idx]
@@ -78,10 +98,13 @@ class Submission:
 
                 # check if the file exists
                 if document_path.exists():
-                    with document_path.open('
+                    with document_path.open('rb') as f:
                         content = f.read()
 
-
+                    if extension in ['.htm','.html','.txt','.xml']:
+                        content = content.decode('utf-8', errors='replace')
+
+                    yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
                 else:
                     print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
 
```
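Files are now opened in binary mode and decoded only when the extension marks a text format, so binary members (images, PDFs) pass through untouched. The same pattern in isolation, with a hypothetical path:

```python
from pathlib import Path

document_path = Path('filings/0001234567-24-000001/primary-doc.html')  # hypothetical
content = document_path.read_bytes()  # equivalent to open('rb') + read()

# Decode only known text formats; errors='replace' keeps going past
# stray non-UTF-8 bytes instead of raising UnicodeDecodeError.
if document_path.suffix in ['.htm', '.html', '.txt', '.xml']:
    content = content.decode('utf-8', errors='replace')
```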
```diff
@@ -89,25 +112,87 @@ class Submission:
                 else:
                     yield self.documents[idx]
 
-    # keep documents by document type
-    def keep(self, document_type):
-        # Convert single document type to list for consistent handling
-        if isinstance(document_type, str):
-            document_types = [document_type]
-        else:
-            document_types = document_type
 
-
-
+
+
+    def save(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        file_dir.mkdir(parents=True, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        with open(metadata_path, 'w') as f:
+            json.dump(self.metadata, f, indent=4)
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
                 filename = doc.get('filename')
-
-
-
-
-
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                with open(doc_path, 'wb') as f:
+                    f.write(content)
+            else:
+                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    f.write(content)
+
+        return file_dir
 
-
-
-
-
-
+    async def save_async(self, output_dir="filings"):
+        file_dir = Path(output_dir) / str(self.accession)
+        os.makedirs(file_dir, exist_ok=True)
+
+        metadata_path = file_dir / "metadata.json"
+        async with aiofiles.open(metadata_path, 'w') as f:
+            await f.write(json.dumps(self.metadata, indent=4))
+
+        for idx, doc in enumerate(self.metadata['documents']):
+            try:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = f"{doc.get('sequence', idx)}.txt"
+            except (KeyError, IndexError):
+                filename = f"{idx}.txt"
+
+            doc_path = file_dir / filename
+
+            if self.path is not None:
+                if hasattr(self, 'documents') and self.documents:
+                    content = self.documents[idx].content
+                else:
+                    orig_doc_path = self.path / filename
+                    if orig_doc_path.exists():
+                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
+                            content = await f.read()
+                    else:
+                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
+                        continue
+            else:
+                content = self.documents[idx].content
+
+            if isinstance(content, bytes):
+                async with aiofiles.open(doc_path, 'wb') as f:
+                    await f.write(content)
+            else:
+                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
+                    await f.write(content)
+
+        return file_dir
```
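The old `keep()` type filter is removed; in its place, `save()` and `save_async()` write `metadata.json` plus each retained document under `<output_dir>/<accession>/` and return that directory. A usage sketch, assuming a submission built from a downloaded SGML file (the local filename and the top-level `Submission` import are assumptions, not confirmed by this diff):

```python
import asyncio
from datamule import Submission  # assumed top-level re-export

with open('0001234567-24-000001.sgml') as f:  # hypothetical local file
    sub = Submission(sgml_content=f.read())

# Synchronous: writes filings/<accession>/metadata.json and the documents,
# then returns the directory it created.
out_dir = sub.save(output_dir='filings')

# Async variant (aiofiles under the hood), handy when persisting many
# submissions concurrently.
asyncio.run(sub.save_async(output_dir='filings'))
```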
{datamule-1.2.0.dist-info → datamule-1.2.3.dist-info}/RECORD
CHANGED
```diff
@@ -1,11 +1,15 @@
-datamule/__init__.py,sha256=
+datamule/__init__.py,sha256=8KioESb9y0Xwy72WuTfsYZnnMFdCrRhSv8DW-kZ4-To,1066
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=CvuyazJ1qP8Ygpv49ikMc8DyGK7N-tApTU2Ccgv57q4,21556
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
 datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
-datamule/
+datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
+datamule/portfolio.py,sha256=so6j2KrkcZOToHIqkANAu3CC4QsfgaUN1zk9CrbRe1E,7225
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=HXuFL6snLevGk7DGlvPbjcBOJuccAIxEPXnkA1TXX8Y,8121
+datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/document.py,sha256=BRnHPVt-vIT7EZTF-c-Ulv3N33xX9zE02Q9mKXVDeuY,9474
+datamule/document/processing.py,sha256=fw-1OWfbmZhG1R8XpJx_vcGwz3_djmk0FrblHAMPmwc,27476
+datamule/document/table.py,sha256=Sv9jTGiVhnWIY9nHaynUUixwbCrvbLsf0fdOnFR-NCY,10791
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,7 +20,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=
+datamule/sec/submissions/downloader.py,sha256=60wX2Yml1UCuxOtU0xMxqqeyHhrypCmlDQ0jZF-StJo,2665
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
 datamule/sec/submissions/monitor.py,sha256=Im2kgnUehhTgyY2Vq3uk07n4Vkj4PjII_SsRDi8ehAE,5384
 datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
@@ -28,9 +32,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=WyJIwuy5mNMXWpx_IkhFzDMe9MOfQ-vNk
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=fJztJ_sEfv2oHHbDff07DRlXLmztXnzt3Yvv5YaZgGk,13718
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.2.
-datamule-1.2.
-datamule-1.2.
-datamule-1.2.
+datamule-1.2.3.dist-info/METADATA,sha256=3gODk6YjozgMTYnjvXRX_pox_Otkq7tSDZY2LEl6MiU,512
+datamule-1.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.2.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.2.3.dist-info/RECORD,,
```
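For reference, each RECORD row follows the wheel spec (PEP 376 / PEP 427): `path,sha256=<digest>,<size>`, where the digest is the unpadded urlsafe-base64 SHA-256 of the file's bytes. A sketch that reproduces a row from an unpacked wheel:

```python
import base64
import hashlib
from pathlib import Path

def record_row(path):
    """Build one wheel RECORD line: path,sha256=<digest>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# e.g. record_row('datamule/config.py') should reproduce the line above
# when run against an unpacked copy of the 1.2.3 wheel.
```

The digest repeated across the `__init__.py` entries, `47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0`, is simply the SHA-256 of an empty file, which is why every zero-byte `__init__.py` shares it.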