datamule 2.1.2__tar.gz → 2.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.1.2 → datamule-2.1.4}/PKG-INFO +1 -1
- datamule-2.1.4/datamule/datasets.py +49 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/document.py +29 -8
- {datamule-2.1.2 → datamule-2.1.4}/datamule/portfolio.py +10 -6
- {datamule-2.1.2 → datamule-2.1.4}/datamule/submission.py +4 -4
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/SOURCES.txt +1 -0
- {datamule-2.1.2 → datamule-2.1.4}/setup.py +1 -1
- {datamule-2.1.2 → datamule-2.1.4}/datamule/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/config.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/downloader.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/utils.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/helper.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/index.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/package_updater.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/utils.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sheet.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/format_accession.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/setup.cfg +0 -0
```diff
--- /dev/null
+++ datamule-2.1.4/datamule/datasets.py
@@ -0,0 +1,49 @@
+# datamule/datasets.py
+from pathlib import Path
+import requests
+import gzip
+import shutil
+import csv
+
+# Dataset URLs
+DATASET_URLS = {
+    "cik_cusip_crosswalk": "https://github.com/john-friedman/datamule-data/raw/refs/heads/master/data/datasets/cik_cusip_crosswalk.csv.gz"
+}
+
+def update_dataset(name):
+    """Force update a dataset by re-downloading it."""
+    return _get_dataset(name, update=True)
+
+def _get_dataset(name, update=False):
+    """Internal function to get dataset as list of dicts, downloading if necessary."""
+    if name not in DATASET_URLS:
+        raise ValueError(f"Unknown dataset: {name}")
+
+    url = DATASET_URLS[name]
+    data_dir = Path.home() / ".datamule" / "datasets"
+    file_path = data_dir / f"{name}.csv"
+
+    if not file_path.exists() or update:
+        print(f"Downloading {name}...")
+        data_dir.mkdir(parents=True, exist_ok=True)
+
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        gz_path = file_path.with_suffix('.csv.gz')
+        with open(gz_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        with gzip.open(gz_path, 'rb') as f_in:
+            with open(file_path, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+        gz_path.unlink()
+
+    # Read CSV and return as list of dicts
+    with open(file_path, 'r') as f:
+        return list(csv.DictReader(f))
+
+# Dataset available as list of dicts on import
+cik_cusip_crosswalk = _get_dataset("cik_cusip_crosswalk")
```
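A minimal usage sketch for the new module (the crosswalk's column names depend on the CSV header, which is not shown in this diff):

```python
from datamule.datasets import cik_cusip_crosswalk, update_dataset

# Downloaded to ~/.datamule/datasets/cik_cusip_crosswalk.csv on first import,
# then served from the local cache. Each row is a dict keyed by the CSV header.
print(len(cik_cusip_crosswalk))
print(cik_cusip_crosswalk[0])

# Force a fresh download if the upstream file has been updated.
cik_cusip_crosswalk = update_dataset("cik_cusip_crosswalk")
```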
```diff
--- datamule-2.1.2/datamule/document/document.py
+++ datamule-2.1.4/datamule/document/document.py
@@ -12,6 +12,7 @@ from selectolax.parser import HTMLParser
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
+import tempfile
 
 from .tables.tables import Tables
 
```
```diff
@@ -36,18 +37,19 @@ class Document:
         # this will be filled by parsed
         self._data = None
         self._tables = None
+        self._text = None
 
 
 
     #_load_text_content
     def _preprocess_txt_content(self):
-
+        self._text = self.content.decode().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
-    #
+    # needs work
     def _preprocess_html_content(self):
         parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
 
```
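Both preprocessors share the same str.maketrans table, which maps non-breaking/em spaces and curly quotes to ASCII equivalents. Shown standalone:

```python
# Identical mapping to the one in _preprocess_txt_content / _preprocess_html_content.
table = str.maketrans({
    '\xa0': ' ', '\u2003': ' ',    # non-breaking space, em space -> space
    '\u2018': "'", '\u2019': "'",  # curly single quotes -> apostrophe
    '\u201c': '"', '\u201d': '"',  # curly double quotes -> straight quotes
})
print('\u201cIt\u2019s\u00a0done\u201d'.translate(table))  # "It's done"
```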
```diff
@@ -95,7 +97,7 @@ class Document:
         while '\n\n\n' in text:
             text = text.replace('\n\n\n', '\n\n')
 
-
+        self._text = text.translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
```
```diff
@@ -116,7 +118,7 @@ class Document:
         mapping_dict = None
 
         if self.extension == '.txt':
-            content = self.
+            content = self.text
             if self.type == '10-Q':
                 mapping_dict = dict_10q
             elif self.type == '10-K':
```
```diff
@@ -224,6 +226,15 @@ class Document:
             self.parse()
         return self._data
 
+    @property
+    def text(self):
+        if self._text is None:
+            if self.extension in ['.htm','.html']:
+                self._preprocess_html_content()
+            elif self.extension == '.txt':
+                self._preprocess_txt_content()
+        return self._text
+
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
```
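The new text property makes extraction lazy and cached: the first access runs the HTML or TXT preprocessor and stores the result in self._text, and later accesses reuse it. A sketch of the call pattern (the constructor arguments mirror the Document(...) call visible in submission.py below and are illustrative only):

```python
# Illustrative: Documents are normally constructed by Submission/Portfolio.
doc = Document(type='10-K', content=html_bytes, extension='.htm',
               filing_date='2024-01-02', accession='0000000000-24-000001', path=None)

plain = doc.text   # first access: parses the HTML, caches into self._text
plain = doc.text   # subsequent accesses: returned from the cache
```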
```diff
@@ -308,13 +319,23 @@ class Document:
             self.parse()
 
         if not self.data:
-
-            webbrowser.open('file://' + str(self.path))
-        else:
-            pass
+            pass
         else:
             visualize_dict(self.data)
 
+    # alpha feature
+    def open(self):
+        """Open the document. Experimental. Creates copy in temp, rather than use tar path for now."""
+        if self.extension in ['.htm', '.html','.txt','.jpg','.png', '.pdf']:
+            # Create a temporary file with the content and open it
+
+            with tempfile.NamedTemporaryFile(mode='wb', suffix=self.extension, delete=False) as f:
+                f.write(self.content)
+                temp_path = f.name
+            webbrowser.open('file://' + temp_path)
+        else:
+            print(f"Cannot open files with extension {self.extension}")
+
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if not self.data:
             self.parse()
```
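The open() helper's temp-file pattern in isolation, as a standalone sketch. delete=False is what lets the browser read the file after the with block closes it; the trade-off is that the temporary copy is left on disk:

```python
import tempfile
import webbrowser

content = b"<html><body><h1>Hello</h1></body></html>"
with tempfile.NamedTemporaryFile(mode='wb', suffix='.html', delete=False) as f:
    f.write(content)
    temp_path = f.name  # keep the path; the file survives because delete=False
webbrowser.open('file://' + temp_path)
```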
```diff
--- datamule-2.1.2/datamule/portfolio.py
+++ datamule-2.1.4/datamule/portfolio.py
@@ -96,12 +96,16 @@ class Portfolio:
             # Create submissions for each accession
             submissions = []
             for accession_prefix in accession_prefixes:
-
-
-
-
-
-
+                try:
+                    submission = Submission(
+                        batch_tar_path=batch_tar_path,
+                        accession_prefix=accession_prefix,
+                        portfolio_ref=self
+                    )
+                    submissions.append(submission)
+                except Exception as e:
+                    pass
+                    #print(f"Path: {batch_tar_path}. Exception: {e}")
                 pbar.update(1) # Update progress for each successful submission
 
             return submissions
```
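A minimal, self-contained sketch of the skip-on-error pattern introduced here (names are illustrative): entries that raise are dropped silently, so the returned list can be shorter than the input. The commented-out print suggests the exception detail is meant to be re-enabled for debugging rather than surfaced by default.

```python
def parse(record):
    if record == "bad":
        raise ValueError("malformed submission")
    return record.upper()

records = ["good-1", "bad", "good-2"]
parsed = []
for record in records:
    try:
        parsed.append(parse(record))
    except Exception:
        pass  # skipped silently, as in Portfolio above
print(parsed)  # ['GOOD-1', 'GOOD-2']
```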
```diff
--- datamule-2.1.2/datamule/submission.py
+++ datamule-2.1.4/datamule/submission.py
@@ -12,6 +12,7 @@ import urllib.request
 from secxbrl import parse_inline_xbrl
 from company_fundamentals import construct_fundamentals
 from decimal import Decimal
+from .utils.format_accession import format_accession
 
 
 class Submission:
```
```diff
@@ -93,11 +94,10 @@ class Submission:
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
+
+            # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
+            self.accession = format_accession(self.accession_prefix,'dash')
 
-            # Band-aid fix: some SGML files in the SEC are bad lol, so they have TWO header sections. Will fix post w/ my cleaned archive
-            if isinstance(self.accession,list):
-                self.accession = self.accession[0]
             #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
```
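format_accession itself is not shown in this diff (datamule/utils/format_accession.py is unchanged, +0 -0 above). Based on its use with 'dash' here, a plausible, hypothetical sketch of what it computes, which may differ from the real helper:

```python
def format_accession(accession, style):
    # Hypothetical reimplementation: keep digits, zero-pad to 18,
    # then lay out as the dashed 10-2-6 SEC accession format.
    digits = ''.join(ch for ch in str(accession) if ch.isdigit()).zfill(18)
    if style == 'dash':
        return f"{digits[:10]}-{digits[10:12]}-{digits[12:]}"
    return digits

print(format_accession('000091205797006494', 'dash'))  # 0000912057-97-006494
```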
```diff
--- datamule-2.1.2/setup.py
+++ datamule-2.1.4/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.1.2",
+    version="2.1.4",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
```