datamule 1.0.3__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +2 -13
- datamule/document.py +8 -9
- datamule/helper.py +85 -105
- datamule/portfolio.py +105 -29
- datamule/submission.py +0 -38
- {datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/METADATA +2 -8
- datamule-1.0.7.dist-info/RECORD +10 -0
- datamule/book/__init__.py +0 -0
- datamule/book/book.py +0 -34
- datamule/book/eftsquery.py +0 -127
- datamule/book/xbrl_retriever.py +0 -88
- datamule/data/company_former_names.csv +0 -8148
- datamule/data/company_metadata.csv +0 -10049
- datamule/data/company_tickers.csv +0 -9999
- datamule/data/sec-glossary.csv +0 -728
- datamule/data/xbrl_descriptions.csv +0 -10024
- datamule/downloader/downloader.py +0 -374
- datamule/downloader/premiumdownloader.py +0 -335
- datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
- datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
- datamule/monitor.py +0 -283
- datamule/mulebot/__init__.py +0 -1
- datamule/mulebot/helper.py +0 -35
- datamule/mulebot/mulebot.py +0 -130
- datamule/mulebot/mulebot_server/__init__.py +0 -1
- datamule/mulebot/mulebot_server/server.py +0 -87
- datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
- datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
- datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
- datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
- datamule/mulebot/search.py +0 -52
- datamule/mulebot/tools.py +0 -82
- datamule/packageupdater.py +0 -207
- datamule-1.0.3.dist-info/RECORD +0 -43
- {datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/WHEEL +0 -0
- {datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/top_level.txt +0 -0
datamule/__init__.py
CHANGED
@@ -1,12 +1,7 @@
-from .downloader.downloader import Downloader
-from .downloader.premiumdownloader import PremiumDownloader
-from .monitor import Monitor
-from .packageupdater import PackageUpdater
 from .submission import Submission
 from .portfolio import Portfolio
 from .document import Document
-from
-from .helper import load_package_csv, load_package_dataset
+from .helper import _load_package_csv, load_package_dataset
 from .config import Config


@@ -32,16 +27,10 @@ def _setup_notebook_env():
 _setup_notebook_env()

 __all__ = [
-    'Downloader',
-    'PremiumDownloader',
-    'load_package_csv',
+    '_load_package_csv',
     'load_package_dataset',
-    'Filing',
     'Portfolio',
-    'Monitor',
-    'PackageUpdater',
     'Submission',
     'Document',
-    'parse_sgml_submission',
     'Config'
 ]
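The 1.0.7 wheel drops the downloader, monitor, MuleBot, and package-updater modules, shrinking the public surface to the names above. A quick smoke test of the new surface (names taken from the new __all__; the directory argument is illustrative):

    from datamule import Portfolio, Submission, Document, Config, load_package_dataset

    # Portfolio now creates its directory if it does not exist (see portfolio.py below).
    portfolio = Portfolio('filings')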
datamule/document.py
CHANGED
@@ -1,11 +1,10 @@
 import json
 import csv
-from .helper import convert_to_dashed_accession
 import re
 from doc2dict import xml2dict, txt2dict, dict2dict
 from doc2dict.mapping import flatten_hierarchy
-from .mapping_dicts import
-from .mapping_dicts import
+from .mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
+from .mapping_dicts.xml_mapping_dicts import dict_345
 from selectolax.parser import HTMLParser

 class Document:
@@ -107,7 +106,7 @@ class Document:

         if self.path.suffix == '.xml':
             if self.type in ['3', '4', '5']:
-                mapping_dict =
+                mapping_dict = dict_345

             self.load_content()
             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
@@ -116,15 +115,15 @@ class Document:
             self._load_file_content()

             if self.type == '10-K':
-                mapping_dict =
+                mapping_dict = dict_10k
             elif self.type == '10-Q':
-                mapping_dict =
+                mapping_dict = dict_10q
             elif self.type == '8-K':
-                mapping_dict =
+                mapping_dict = dict_8k
             elif self.type == 'SC 13D':
-                mapping_dict =
+                mapping_dict = dict_13d
             elif self.type == 'SC 13G':
-                mapping_dict =
+                mapping_dict = dict_13g

             self.data = {}
             self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
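The parse path above dispatches on file suffix and form type: XML ownership forms (3, 4, 5) go through xml2dict, while text-based forms go through txt2dict and then dict2dict. A minimal standalone sketch of that dispatch, using only the doc2dict calls visible in this diff (the function name and signature are hypothetical):

    from doc2dict import xml2dict, txt2dict, dict2dict

    # Hypothetical free-function restatement of Document.parse's dispatch.
    def parse_filing(content, suffix, form_type, mapping_dict):
        # Ownership forms are parsed as XML; everything else is parsed as
        # text, then normalized with dict2dict.
        if suffix == '.xml' and form_type in ['3', '4', '5']:
            return xml2dict(content=content, mapping_dict=mapping_dict)
        return {'document': dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))}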
datamule/helper.py
CHANGED
@@ -1,123 +1,103 @@
-import requests
-import os
-from tqdm import tqdm
-import zipfile
-from pkg_resources import resource_filename
+from functools import lru_cache
 import csv
-import re
+from pathlib import Path

-
-
-
-
-
-# DONE
-def _download_from_dropbox(url, output_path):
-    headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
-    r = requests.get(url, stream=True, headers=headers)
-    total_size = int(r.headers.get('content-length', 0))
+def _load_package_csv(name):
+    """Load CSV files from ~/.datamule/ directory"""
+    data_dir = Path.home() / ".datamule"
+    csv_path = data_dir / f"{name}.csv"

-
-        desc="Downloading " + os.path.basename(output_path),
-        total=total_size,
-        unit='iB',
-        unit_scale=True,
-        unit_divisor=1024,
-    ) as progress_bar:
-        for chunk in r.iter_content(chunk_size=1024):
-            size = f.write(chunk)
-            progress_bar.update(size)
-
-    # Check if the downloaded file is a zip file
-    if zipfile.is_zipfile(output_path):
-        extract_path = os.path.dirname(output_path)
-        with zipfile.ZipFile(output_path, 'r') as zip_ref:
-            for file_info in zip_ref.infolist():
-                extract_file_path = os.path.join(extract_path, file_info.filename)
-                with zip_ref.open(file_info) as file_in_zip, \
-                     open(extract_file_path, 'wb') as output_file, \
-                     tqdm(total=file_info.file_size, unit='B', unit_scale=True,
-                          desc=f"Extracting {file_info.filename}") as pbar:
-                    while True:
-                        chunk = file_in_zip.read(8192)
-                        if not chunk:
-                            break
-                        output_file.write(chunk)
-                        pbar.update(len(chunk))
-
-        # Remove the zip file after extraction
-        os.remove(output_path)
-        print(f"Extracted contents to {extract_path}")
-    else:
-        print(f"Downloaded file is not a zip. Saved to {output_path}")
-
-# May generalize to load any package resource
-def load_package_csv(name):
-    """Load package CSV files"""
-    csv_path = resource_filename('datamule', f'data/{name}.csv')
-    company_tickers = []
+    data = []

     with open(csv_path, 'r') as csvfile:
         csv_reader = csv.DictReader(csvfile)
         for row in csv_reader:
-
+            data.append(row)

-    return
+    return data

 def load_package_dataset(dataset):
-    if dataset ==
-        return
-    elif dataset =='company_former_names':
-        return load_package_csv('company_former_names')
-    elif dataset =='company_metadata':
-        return load_package_csv('company_metadata')
-    elif dataset == 'sec_glossary':
-        return load_package_csv('sec-glossary')
-    elif dataset == 'xbrl_descriptions':
-        return load_package_csv('xbrl_descriptions')
+    if dataset =='listed_filer_metadata':
+        return _load_package_csv('listed_filer_metadata')

-
-def
-
-
-    if ticker:
-
-
-
-
-
-
-
-
-
-
-
+@lru_cache(maxsize=128)
+def get_cik_from_dataset(dataset_name, key, value):
+    dataset = load_package_dataset(dataset_name)
+
+    if dataset_name == 'listed_filer_metadata' and key == 'ticker':
+        key = 'tickers'
+
+    result = []
+    for company in dataset:
+        if key in ['tickers', 'exchanges'] and dataset_name == 'listed_filer_metadata':
+            # Parse the string representation of list into an actual list
+            list_values = [i.strip() for i in company[key][1:-1].replace("'", "").replace('"', '').split(',')]
+            if str(value) in list_values:
+                result.append(company['cik'])
+        elif str(value) == company[key]:
+            result.append(company['cik'])
+
+    return result


-def fix_filing_url(url):
-    match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
-    if match_suffix:
-        suffix_number = match_suffix.group(1)
-        file_ext = match_suffix.group(2)
-        match_accession = re.search(r'/(\d{18})/', url)
-        if match_accession:
-            accession_number = match_accession.group(1)
-            formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
-            new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
-            return new_url
-    return url

-
-
-
+@lru_cache(maxsize=128)
+def get_ciks_from_metadata_filters(**kwargs):
+    """Get CIKs from listed_filer_metadata.csv that match all provided filters."""

-    #
-
-    raise ValueError("Invalid accession number format. Expected 18 characters.")
+    # Start with None to get all CIKs from first filter
+    result_ciks = None

-    #
-
+    # For each filter, get matching CIKs and keep intersection
+    for key, value in kwargs.items():
+        # Get CIKs for this filter
+        ciks = get_cik_from_dataset('listed_filer_metadata', key, value)
+        ciks = [int(cik) for cik in ciks]
+
+        # If this is the first filter, set as initial result
+        if result_ciks is None:
+            result_ciks = set(ciks)
+        # Otherwise, take intersection with previous results
+        else:
+            result_ciks &= set(ciks)
+
+        # If no matches left, we can exit early
+        if not result_ciks:
+            return []

-    return
+    return list(result_ciks)
+
+
+def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
+    """
+    Helper method to process CIK, ticker, and metadata filters.
+    Returns a list of CIKs after processing.
+    """
+    # Input validation
+    if cik is not None and ticker is not None:
+        raise ValueError("Only one of cik or ticker should be provided, not both.")
+
+    # Convert ticker to CIK if provided
+    if ticker is not None:
+        cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+
+    # Normalize CIK format
+    if cik is not None:
+        if isinstance(cik, str):
+            cik = [int(cik)]
+        elif isinstance(cik, int):
+            cik = [cik]
+        elif isinstance(cik, list):
+            cik = [int(x) for x in cik]
+
+    # Process metadata filters if provided
+    if kwargs:
+        metadata_ciks = get_ciks_from_metadata_filters(**kwargs)

-
+        if cik is not None:
+            cik = list(set(cik).intersection(metadata_ciks))
+        else:
+            cik = metadata_ciks
+
+    return cik
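helper.py now resolves identifiers from a local ~/.datamule/listed_filer_metadata.csv rather than CSVs bundled in the wheel. A hedged usage sketch (assumes that file exists and carries cik, tickers, and exchanges columns, as the code above implies; the ticker and exchange values are illustrative):

    from datamule.helper import get_cik_from_dataset, _process_cik_and_metadata_filters

    # 'ticker' is rewritten internally to the list-valued 'tickers' column.
    ciks = get_cik_from_dataset('listed_filer_metadata', 'ticker', 'AAPL')

    # Ticker plus metadata filters; each keyword narrows the CIK set by intersection.
    filtered = _process_cik_and_metadata_filters(ticker='AAPL', exchanges='NASDAQ')

Both lookup functions are wrapped in lru_cache, so repeated calls with the same arguments parse the CSV only once.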
datamule/portfolio.py
CHANGED
@@ -2,19 +2,29 @@ from pathlib import Path
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor
 from .submission import Submission
-from .downloader
-from .
+from .sec.submissions.downloader import download as sec_download
+from .sec.submissions.textsearch import filter_text
 from .config import Config
 import os
+from .helper import _process_cik_and_metadata_filters
+from .seclibrary.downloader import download as seclibrary_download
+from .sec.xbrl.filter_xbrl import filter_xbrl
+from .sec.submissions.monitor import monitor
+from .sec.xbrl.xbrlmonitor import XBRLMonitor
+

 class Portfolio:
     def __init__(self, path):
         self.path = Path(path)
         self.submissions = []
+        self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1

         if self.path.exists():
             self._load_submissions()
+            self.submissions_loaded = True
+        else:
+            self.path.mkdir(parents=True, exist_ok=True)

     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -40,6 +50,8 @@ class Portfolio:

     def process_submissions(self, callback):
         """Process all submissions using a thread pool."""
+        if not self.submissions_loaded:
+            self._load_submissions()
         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
             results = list(tqdm(
                 executor.map(callback, self.submissions),
@@ -50,6 +62,9 @@ class Portfolio:

     def process_documents(self, callback):
         """Process all documents using a thread pool."""
+        if not self.submissions_loaded:
+            self._load_submissions()
+
         documents = [doc for sub in self.submissions for doc in sub]

         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
@@ -59,48 +74,109 @@ class Portfolio:
             desc="Processing documents"
         ))
         return results
+
+    def filter_text(self, text_query, cik=None, ticker=None, submission_type=None, filing_date=None, **kwargs):
+        """
+        Filter text based on query and various parameters.
+        When called multiple times, takes the intersection of results.
+        Now supports metadata filters through kwargs.
+        """
+        # Process CIK and metadata filters
+        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        # Call the filter_text function with processed parameters
+        new_accession_numbers = filter_text(
+            text_query=text_query,
+            cik=cik,
+            submission_type=submission_type,
+            filing_date=filing_date
+        )
+
+        # If we already have accession numbers, take the intersection
+        if hasattr(self, 'accession_numbers') and self.accession_numbers:
+            self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+        else:
+            # First query, just set the accession numbers
+            self.accession_numbers = new_accession_numbers
+
+    def filter_xbrl(self, taxonomy, concept, unit, period, logic, value):
+        """
+        Filter XBRL data based on logic and value.
+        """
+        new_accession_numbers = filter_xbrl(
+            taxonomy=taxonomy,
+            concept=concept,
+            unit=unit,
+            period=period,
+            logic=logic,
+            value=value
+        )
+
+        # If we already have accession numbers, take the intersection
+        if hasattr(self, 'accession_numbers') and self.accession_numbers:
+            self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+        else:
+            # First query, just set the accession numbers
+            self.accession_numbers = new_accession_numbers

-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()

-
-
-
+        # Process CIK and metadata filters
+        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        if provider == 'datamule':
+
+            seclibrary_download(
+                output_dir=self.path,
+                cik=cik,
+                submission_type=submission_type,
+                filing_date=filing_date,
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+            )
+        else:
+            sec_download(
+                output_dir=self.path,
+                cik=cik,
+                submission_type=submission_type,
+                filing_date=filing_date,
+                requests_per_second=5,  # Revisit this later.
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+            )
+
+        self.submissions_loaded = False
+    def monitor_submissions(self, data_callback=None, poll_callback=None, submission_type=None, cik=None,
+                            polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+
+        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        monitor(
+            data_callback=data_callback,
+            poll_callback=poll_callback,
             cik=cik,
-            ticker=ticker,
             submission_type=submission_type,
-
+            polling_interval=polling_interval,
+            requests_per_second=requests_per_second,
+            quiet=quiet,
+            start_date=start_date
         )
-
-        # Reload submissions after download
-        self._load_submissions()

+
+
+
     def __iter__(self):
+        if not self.submissions_loaded:
+            self._load_submissions()
         return iter(self.submissions)

     def document_type(self, document_types):
         """Filter documents by type(s)."""
+        if not self.submissions_loaded:
+            self._load_submissions()
         if isinstance(document_types, str):
             document_types = [document_types]

         for submission in self.submissions:
-            yield from submission.document_type(document_types)
-
-    def contains_string(self, pattern, document_types=None):
-        """Search for pattern in documents, with optional type filter."""
-        def check_document(document):
-            return document if document.contains_string(pattern) else None
-
-        # Get documents, filtered by type if specified
-        documents = list(self.document_type(document_types)) if document_types else [
-            doc for sub in self.submissions for doc in sub
-        ]
-
-        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
-            results = executor.map(check_document, documents)
-
-        for doc in tqdm(results, total=len(documents), desc=f"Searching for '{pattern}'"):
-            if doc is not None:
-                yield doc
+            yield from submission.document_type(document_types)
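Together these methods form a filter-then-download pipeline: each filter_* call narrows self.accession_numbers by set intersection, and download_submissions forwards the surviving set to the selected provider. A hedged end-to-end sketch (the query, ticker, and XBRL arguments are illustrative values, not documented defaults):

    from datamule import Portfolio

    portfolio = Portfolio('apple')

    # Each successive filter intersects with the previous accession-number set.
    portfolio.filter_text('"supply chain"', ticker='AAPL', submission_type='10-K')
    portfolio.filter_xbrl(taxonomy='us-gaap', concept='ResearchAndDevelopmentExpense',
                          unit='USD', period='2023', logic='>', value=0)

    # Only filings that survived both filters are downloaded. submissions_loaded
    # is reset afterwards, so the iteration below reloads from disk lazily.
    portfolio.download_submissions(provider='datamule')
    for submission in portfolio:
        print(submission.path)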
datamule/submission.py
CHANGED
@@ -11,44 +11,6 @@ class Submission:
         metadata_path = self.path / 'metadata.json'
         with metadata_path.open('r') as f:
             self.metadata = json.load(f)
-
-    def keep(self, document_types):
-        """Keep files of specified document types, delete others
-        Args:
-            document_types: string or list of strings representing document types to keep
-        """
-        # Convert single string to list for consistent handling
-        if isinstance(document_types, str):
-            document_types = [document_types]
-
-        for doc in self.metadata['documents']:
-            filename = doc.get('filename')
-            if filename is None:
-                continue
-
-            filepath = self.path / filename
-            # Delete if document type isn't in our keep list
-            if doc['type'] not in document_types and filepath.exists():
-                filepath.unlink()
-
-    def drop(self, document_types):
-        """Delete files of specified document types, keep others
-        Args:
-            document_types: string or list of strings representing document types to drop
-        """
-        # Convert single string to list for consistent handling
-        if isinstance(document_types, str):
-            document_types = [document_types]
-
-        for doc in self.metadata['documents']:
-            filename = doc.get('filename')
-            if filename is None:
-                continue
-
-            filepath = self.path / filename
-            # Delete if document type is in our drop list
-            if doc['type'] in document_types and filepath.exists():
-                filepath.unlink()

     def document_type(self, document_type):
         # Convert single document type to list for consistent handling
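Submission.keep() and Submission.drop() are gone in 1.0.7, but both can be reproduced from the metadata.json the class still loads. A standalone equivalent of the removed keep() (hypothetical helper, not part of the package):

    import json
    from pathlib import Path

    def keep_document_types(submission_path, document_types):
        """Delete a submission's files whose type is not in document_types."""
        if isinstance(document_types, str):
            document_types = [document_types]

        submission_path = Path(submission_path)
        with (submission_path / 'metadata.json').open('r') as f:
            metadata = json.load(f)

        for doc in metadata['documents']:
            filename = doc.get('filename')
            if filename is None:
                continue
            filepath = submission_path / filename
            if doc['type'] not in document_types and filepath.exists():
                filepath.unlink()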
{datamule-1.0.3.dist-info → datamule-1.0.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.0.3
+Version: 1.0.7
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -17,11 +17,5 @@ Requires-Dist: pytz
 Requires-Dist: zstandard
 Requires-Dist: doc2dict
 Requires-Dist: secsgml
-Provides-Extra: all
-Requires-Dist: openai; extra == "all"
-Requires-Dist: flask; extra == "all"
-Provides-Extra: mulebot
-Requires-Dist: openai; extra == "mulebot"
-Provides-Extra: mulebot_server
-Requires-Dist: flask; extra == "mulebot-server"
+Requires-Dist: lxml
datamule-1.0.7.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+datamule/__init__.py,sha256=0npnB3i2F7YB7etG315oDiCd-eMo-A6MP5LX2gQclHY,914
+datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
+datamule/document.py,sha256=BC8jdVy9pMOA9ghIqV5N2XJidmVNThqbBohsuSAnVoY,10813
+datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
+datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
+datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
+datamule-1.0.7.dist-info/METADATA,sha256=HY7kDLSjl9RuZoJkpgCBA2ugL0EpzzXMc0S7-4qjcNk,512
+datamule-1.0.7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+datamule-1.0.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.0.7.dist-info/RECORD,,
datamule/book/__init__.py
DELETED
File without changes

datamule/book/book.py
DELETED
@@ -1,34 +0,0 @@
-# Streams data rather than downloading it.
-# additional functionality such as query by xbrl, and other db
-# also this is basically our experimental rework of portfolio w/o disturbing existing users
-# this is highly experimental and may not work as expected
-# only for datamule source
-# likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
-# wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
-
-# example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
-
-# hmm do we go for sql esq or not.
-# I think we do.
-# i think we remove cik, ticker, sic, etc and just have a query object
-# should be sql esq so users can use it easily w/o learnign new syntax
-
-# WHERE submission_type = '10-K'
-# AND us-gaap:ResearchAndDevelopmentExpense > 0
-# AND dei:debt_to_equity < 2
-# AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
-# AND CIK in (123, 456, 789)
-# AND SIC in (123, 456, 789)
-# AND ticker in ('AAPL', 'GOOGL', 'AMZN')
-# AND document_type = 'EX-99.1' # to select attachments
-
-from .eftsquery import EFTSQuery
-
-
-class Book():
-    def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
-                            xbrl_query={},
-                            metadata_callback=None,
-                            document_callback=None,):
-        # grabs data and processes it
-        pass